MSDS 7331 Lab Two: Classification¶
Authors: Jaren Shead, Kristin Henderson, Tom Hines¶
Setup & Data Import¶
# Essential Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import re
import time
# Machine Learning Libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor, Lasso, LassoCV
from sklearn.svm import SVC, SVR, LinearSVR
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor
from sklearn import metrics as mt
from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.experimental import enable_halving_search_cv # Enable HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV
from scipy.stats import ttest_rel
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests
# Display plots inline
%matplotlib inline
# Load dataset
df = pd.read_csv('data/diabetes+130-us+hospitals+for+years+1999-2008/diabetic_data.csv')
df.head()
| encounter_id | patient_nbr | race | gender | age | weight | admission_type_id | discharge_disposition_id | admission_source_id | time_in_hospital | ... | citoglipton | insulin | glyburide-metformin | glipizide-metformin | glimepiride-pioglitazone | metformin-rosiglitazone | metformin-pioglitazone | change | diabetesMed | readmitted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2278392 | 8222157 | Caucasian | Female | [0-10) | ? | 6 | 25 | 1 | 1 | ... | No | No | No | No | No | No | No | No | No | NO |
| 1 | 149190 | 55629189 | Caucasian | Female | [10-20) | ? | 1 | 1 | 7 | 3 | ... | No | Up | No | No | No | No | No | Ch | Yes | >30 |
| 2 | 64410 | 86047875 | AfricanAmerican | Female | [20-30) | ? | 1 | 1 | 7 | 2 | ... | No | No | No | No | No | No | No | No | Yes | NO |
| 3 | 500364 | 82442376 | Caucasian | Male | [30-40) | ? | 1 | 1 | 7 | 2 | ... | No | Up | No | No | No | No | No | Ch | Yes | NO |
| 4 | 16680 | 42519267 | Caucasian | Male | [40-50) | ? | 1 | 1 | 7 | 1 | ... | No | Steady | No | No | No | No | No | Ch | Yes | NO |
5 rows × 50 columns
Data Preparation¶
Data Cleaning & Preprocessing¶
# Make a copy of the dataset
df_clean = df.copy()
# Replace '?' with NaN
df_clean.replace('?', np.nan, inplace=True)
# Fill missing values
df_clean[['medical_specialty', 'payer_code', 'race']] = df_clean[['medical_specialty', 'payer_code', 'race']].fillna('Unknown')
df_clean[['diag_1', 'diag_2', 'diag_3']] = df_clean[['diag_1', 'diag_2', 'diag_3']].fillna('Unknown/None')
df_clean[['max_glu_serum', 'A1Cresult']] = df_clean[['max_glu_serum', 'A1Cresult']].fillna('Untested')
# Convert numeric categorical columns to strings explicitly (not category yet)
numeric_categorical_cols = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']
df_clean[numeric_categorical_cols] = df_clean[numeric_categorical_cols].astype(str)
# Drop unnecessary columns
df_clean.drop(columns=['encounter_id', 'examide', 'citoglipton', 'weight', 'patient_nbr'], inplace=True)
# Define ordinal category orders
category_orders = {
'readmitted': ['<30', '>30', 'NO'],
'max_glu_serum': ['Untested', 'Norm', '>200', '>300'],
'A1Cresult': ['Untested', 'Norm', '>7', '>8'],
'age': ['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
'[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)']
}
# Convert ordinal variables
for col, order in category_orders.items():
df_clean[col] = pd.Categorical(df_clean[col], categories=order, ordered=True)
# Convert drug variables to ordinal categories
drug_order = ['No', 'Down', 'Steady', 'Up']
drug_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'tolazamide',
'pioglitazone', 'rosiglitazone', 'troglitazone', 'acarbose', 'miglitol',
'insulin', 'glyburide-metformin', 'glipizide-metformin',
'metformin-rosiglitazone', 'metformin-pioglitazone', 'glimepiride-pioglitazone']
for col in drug_cols:
df_clean[col] = pd.Categorical(df_clean[col], categories=drug_order, ordered=True)
# Preprocess diag_1, diag_2, diag_3 combining all codes with decimals under their integer values
for col in ['diag_1', 'diag_2', 'diag_3']:
df_clean[col] = df_clean[col].str.split('.').str[0] # Drop decimals and digits after
df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 101766 entries, 0 to 101765 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 race 101766 non-null object 1 gender 101766 non-null object 2 age 101766 non-null category 3 admission_type_id 101766 non-null object 4 discharge_disposition_id 101766 non-null object 5 admission_source_id 101766 non-null object 6 time_in_hospital 101766 non-null int64 7 payer_code 101766 non-null object 8 medical_specialty 101766 non-null object 9 num_lab_procedures 101766 non-null int64 10 num_procedures 101766 non-null int64 11 num_medications 101766 non-null int64 12 number_outpatient 101766 non-null int64 13 number_emergency 101766 non-null int64 14 number_inpatient 101766 non-null int64 15 diag_1 101766 non-null object 16 diag_2 101766 non-null object 17 diag_3 101766 non-null object 18 number_diagnoses 101766 non-null int64 19 max_glu_serum 101766 non-null category 20 A1Cresult 101766 non-null category 21 metformin 101766 non-null category 22 repaglinide 101766 non-null category 23 nateglinide 101766 non-null category 24 chlorpropamide 101766 non-null category 25 glimepiride 101766 non-null category 26 acetohexamide 101766 non-null category 27 glipizide 101766 non-null category 28 glyburide 101766 non-null category 29 tolbutamide 101766 non-null category 30 pioglitazone 101766 non-null category 31 rosiglitazone 101766 non-null category 32 acarbose 101766 non-null category 33 miglitol 101766 non-null category 34 troglitazone 101766 non-null category 35 tolazamide 101766 non-null category 36 insulin 101766 non-null category 37 glyburide-metformin 101766 non-null category 38 glipizide-metformin 101766 non-null category 39 glimepiride-pioglitazone 101766 non-null category 40 metformin-rosiglitazone 101766 non-null category 41 metformin-pioglitazone 101766 non-null category 42 change 101766 non-null object 43 diabetesMed 101766 non-null object 44 readmitted 101766 non-null category dtypes: category(25), int64(8), object(12) memory usage: 18.0+ MB
print(df_clean['change'].describe())
print(df_clean['change'].value_counts())
count 101766 unique 2 top No freq 54755 Name: change, dtype: object change No 54755 Ch 47011 Name: count, dtype: int64
Classification Task¶
Our first task is predicting if a patient will have a change in medication, the variable change. From an insurance company's perspective, this may be an important thing to predict to distribute resources and identify inaccuracies to cut costs.
EDA & Visualizations of the Response Variable change¶
This is a quick look at the distribution of each numerical variable for patients with a change and those without.
change_order = ['Ch', 'No']
# List of numeric columns
numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
# Set the color palette
palette = sns.color_palette("muted", n_colors=len(change_order))
# Generate plots for each numeric column
for col in numeric_cols:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
# Histogram with proportions
for idx, category in enumerate(change_order):
subset = df_clean[df_clean['change'] == category]
sns.histplot(subset[col], bins=20, color=palette[idx], label=category, alpha=0.5, stat="probability", ax=axes[0])
axes[0].set_title(f"Histogram of {col} (Proportions) by Response Class")
axes[0].set_xlabel(col)
axes[0].set_ylabel('Proportion')
axes[0].legend(title='Medication Change')
# Boxplot
sns.boxplot(data=df_clean, x='change', y=col, hue='change', palette=palette[:len(change_order)], ax=axes[1])
axes[1].set_title(f"Boxplot of {col} by Response Class")
axes[1].set_xlabel('Medication Change')
axes[1].set_ylabel(col)
plt.tight_layout()
# plt.savefig(f'plots/plot_{col}_by_readmitted.png', bbox_inches="tight")
plt.show()
This create cross-tabulations for each categorical variable by response class, showing the percentage of patients in each category.
subset_df = df_clean.select_dtypes(include=['object', 'category'])
# subset_df = subset_df.drop(['medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'patient_nbr', 'admission_type_id', 'max_glu_serum', 'A1Cresult'], axis=1)
# subset_df.columns
response = 'change' # Attribute of interest
other_columns = [col for col in subset_df.columns if col != response] # Exclude target
crosstab_results = {}
for column in other_columns:
crosstab_results[column] = pd.crosstab(subset_df[response], subset_df[column])
def create_heatmaps(crosstab_results, response, normalization='none', n_cols=4, cmap="plasma"):
"""
Creates a grid of heatmaps for given crosstab results.
Parameters:
crosstab_results (dict): Dictionary of crosstabs for each feature.
response (str): Name of the response variable.
normalization (str): Normalization type ('row', 'column', 'overall').
n_cols (int): Number of columns in the subplot grid.
cmap (str): Colormap for the heatmaps.
"""
n_features = len(crosstab_results) # Total features to plot
n_rows = -(-n_features // n_cols) # Compute rows by rounding up
# Create the figure and axes for subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), sharex=False, sharey=False)
axes = np.ravel(axes) # Flatten axes for consistent 1D handling
# Loop through crosstabs and create heatmaps
for i, (col, crosstab) in enumerate(crosstab_results.items()):
# Normalize the crosstab based on the selected normalization
if normalization == 'none':
normalized_crosstab = crosstab
elif normalization == 'row':
normalized_crosstab = crosstab.div(crosstab.sum(axis=1), axis=0) # Normalize by row
elif normalization == 'column':
normalized_crosstab = crosstab.div(crosstab.sum(axis=0), axis=1) # Normalize by column
elif normalization == 'overall':
normalized_crosstab = crosstab / crosstab.values.sum() # Normalize overall
else:
raise ValueError("Invalid normalization type. Choose 'none', row', 'column', or 'overall'.")
# Plot heatmap
sns.heatmap(normalized_crosstab, annot=True, fmt=".2f" if normalization != 'none' else "d", cmap=cmap, cbar=False, ax=axes[i])
axes[i].set_title(f"Crosstab of {response} vs {col}")
axes[i].set_xlabel(col)
axes[i].set_ylabel(response)
# Remove unused subplots
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
# Adjust layout for better spacing
plt.tight_layout()
# plt.savefig(f'plots/crossTab_heatmaps.png')
plt.show()
# create_heatmaps(crosstab_results, response='readmitted', normalization='none', n_cols=4, cmap="plasma") # Crosstab Heatmap: No Normalization, Raw Counts
# create_heatmaps(crosstab_results, response='readmitted', normalization='row', n_cols=4, cmap="plasma") # Crosstab Heatmap: No Normalization, Proportion of Factor Levels within the Response Variable
# create_heatmaps(crosstab_results, response='readmitted', normalization='column', n_cols=4, cmap="plasma") # Crosstab Heatmap: Column Normalization, Proportion of Factor Levels within the Predictor Variable
create_heatmaps(crosstab_results, response='change', normalization='overall', n_cols=3, cmap="magma") # Crosstab Heatmap: Overall Normalization, Proportion of Total Data
- Histograms and boxplots of numerical variables showed differences in distribution for
time_in_hospitalandnum_medicationsacross response classes. - Categorical variables such as
insulin,diabetesMed,readmitted status, and various medication/testing variables displayed different distributions between groups, suggesting they could be useful predictors.
Separating the Explanatory and Response Variables¶
# Extract response variable
y_change = df_clean['change']
X_change = df_clean.drop(columns=['change'])
# Convert target variable to a binary numeric (1 for 'Ch', 0 for 'No')
y_binary = y_change.copy()
y_binary = np.where(y_change == 'Ch', 1, 0)
Encoding & Scaling¶
# Define categorical and numerical feature subsets
categorical_cols = X_change.select_dtypes(include=['object', 'category']).columns
numerical_cols = X_change.select_dtypes(include=['int64', 'float64']).columns
# One-Hot Encoding categorical variables
X_change_encoded = pd.get_dummies(X_change, columns=categorical_cols, drop_first=True) # drop_first for multicollinearity issues
# Define the preprocessing pipeline
# Apply StandardScaler only to numerical columns while keeping categorical (one-hot encoded) features unchanged
preprocessor = ColumnTransformer([('num', StandardScaler(), numerical_cols)], remainder='passthrough')
- The response variable
changewas separated from the explanatory features. - Categorical variables were encoded, and scaling was applied to numerical variables for appropriate models.
Strategy for Performance Validation¶
- The dataset was split into training and holdout sets (80/20 split) while stratifying the response variable to maintain class balance.
- A stratified $k$-fold cross-validation (CV) approach was used:
- Feature selection and hyperparameter tuning were performed using 2-5 fold CV, depending on computational constraints.
- To speed up hyperparameter tuning, a 50% subsample of the training data was used.
- Final model comparison was conducted using 10-fold CV to obtain stable estimates of performance and generalization.
- The holdout test set (20%) was reserved for final model evaluation after tuning.
- Given the dataset size (~100,000 records), an 80/20 split was chosen to:
- Ensure sufficient samples for training and validation, while leaving enough data for an unbiased test set.
- Preserve representation across categories, ensuring the training set captures the variance in categorical features.
Creating a Stratified Holdout Test Set¶
# Split into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_change_encoded, y_binary, test_size=0.2, random_state=1234)
Feature Engineering & Selection¶
# ----- LASSO Feature Selection -----
start_time = time.time()
lasso_selector = LassoCV(cv=5, random_state=1234)
# Create a pipeline with preprocessing and LASSO
lasso_pipeline = Pipeline([
('preprocessor', preprocessor),
('clf', lasso_selector)
])
# Fit the pipeline
lasso_pipeline.fit(X_train, y_train)
# Retrieve feature names after transformation
feature_names = numerical_cols.tolist() + [col for col in X_train.columns if col not in numerical_cols]
# Select features with non-zero coefficients
lasso_selected_features = [feature_names[i] for i in range(len(feature_names)) if lasso_selector.coef_[i] != 0]
print(f"LASSO time: {(time.time() - start_time)/60:.2f} minutes")
# ----- Random Forest Feature Selection -----
start_time = time.time()
rf_selector = RandomForestRegressor(n_estimators=100, random_state=1234)
rf_selector.fit(X_train, y_train)
# Get feature importances and select top features
feature_importances = pd.Series(rf_selector.feature_importances_, index=X_train.columns)
rf_selected_features = feature_importances[feature_importances > np.percentile(feature_importances, 75)].index.tolist() # Top 25% features
print(f"RF time: {(time.time() - start_time)/60:.2f} minutes")
# - `lasso_selected_features` is the list of features selected by LASSO
# - `rf_selected_features` is the list of features selected by Random Forest
# - `X_change` is the original dataset before encoding
# - `categorical_cols` and `numerical_cols` are already defined
# Convert feature lists to DataFrames for processing
lasso_df = pd.DataFrame({'Feature': lasso_selected_features})
rf_df = pd.DataFrame({'Feature': rf_selected_features})
# ----- Extract Original Variable Names for LASSO -----
lasso_numeric = lasso_df['Feature'][lasso_df['Feature'].isin(numerical_cols)]
lasso_categorical = lasso_df['Feature'][~lasso_df['Feature'].isin(numerical_cols)]
lasso_original_categorical = lasso_categorical.str.replace(r'_[^_]+$', '', regex=True) # Remove one-hot encoding suffixes
# Combine numeric and cleaned categorical features
lasso_unique_features = pd.concat([lasso_numeric, lasso_original_categorical]).unique()
# ----- Extract Original Variable Names for Random Forest -----
rf_numeric = rf_df['Feature'][rf_df['Feature'].isin(numerical_cols)]
rf_categorical = rf_df['Feature'][~rf_df['Feature'].isin(numerical_cols)]
rf_original_categorical = rf_categorical.str.replace(r'_[^_]+$', '', regex=True)
# Combine numeric and cleaned categorical features
rf_unique_features = pd.concat([rf_numeric, rf_original_categorical]).unique()
# ----- Find Common & Unused Features -----
dataset_features = pd.Series(X_change.columns) # Original dataset BEFORE encoding
# Features selected by both LASSO & RF
common_features = pd.Series(list(set(lasso_unique_features) & set(rf_unique_features)))
# Features in dataset but not selected by either method
unused_features = dataset_features[~dataset_features.isin(pd.concat([pd.Series(lasso_unique_features), pd.Series(rf_unique_features)]))]
# Print Results
print("\nLASSO Unique Selected Features (Original Variables):")
print(lasso_unique_features)
print("\nRandom Forest Unique Selected Features (Original Variables):")
print(rf_unique_features)
print("\nCommon Features Selected by Both LASSO and RF:")
print(common_features.to_list())
print("\nUnused Features (Present in Dataset but NOT Selected by LASSO or RF):")
print(unused_features.to_list())
LASSO time: 0.36 minutes RF time: 3.88 minutes LASSO Unique Selected Features (Original Variables): ['time_in_hospital' 'num_lab_procedures' 'num_procedures' 'num_medications' 'number_outpatient' 'number_emergency' 'number_inpatient' 'number_diagnoses' 'race' 'gender' 'age' 'admission_type_id' 'discharge_disposition_id' 'admission_source_id' 'payer_code' 'medical_specialty' 'diag_1' 'diag_2' 'diag_3' 'max_glu_serum' 'A1Cresult' 'metformin' 'repaglinide' 'nateglinide' 'chlorpropamide' 'glimepiride' 'glipizide' 'glyburide' 'pioglitazone' 'rosiglitazone' 'acarbose' 'insulin' 'glyburide-metformin' 'diabetesMed' 'readmitted'] Random Forest Unique Selected Features (Original Variables): ['time_in_hospital' 'num_lab_procedures' 'num_procedures' 'num_medications' 'number_outpatient' 'number_emergency' 'number_inpatient' 'number_diagnoses' 'race' 'gender' 'age' 'admission_type_id' 'discharge_disposition_id' 'payer_code' 'medical_specialty' 'diag_1' 'diag_2' 'diag_3' 'metformin' 'repaglinide' 'nateglinide' 'chlorpropamide' 'glimepiride' 'acetohexamide' 'glipizide' 'glyburide' 'tolbutamide' 'pioglitazone' 'rosiglitazone' 'acarbose' 'miglitol' 'troglitazone' 'tolazamide' 'insulin' 'glyburide-metformin' 'glipizide-metformin' 'glimepiride-pioglitazone' 'diabetesMed' 'readmitted'] Common Features Selected by Both LASSO and RF: ['insulin', 'number_outpatient', 'repaglinide', 'discharge_disposition_id', 'diag_2', 'rosiglitazone', 'glyburide', 'nateglinide', 'diag_1', 'num_procedures', 'diabetesMed', 'gender', 'race', 'num_lab_procedures', 'admission_type_id', 'payer_code', 'acarbose', 'glimepiride', 'pioglitazone', 'number_diagnoses', 'number_emergency', 'glipizide', 'number_inpatient', 'chlorpropamide', 'glyburide-metformin', 'time_in_hospital', 'medical_specialty', 'readmitted', 'age', 'metformin', 'num_medications', 'diag_3'] Unused Features (Present in Dataset but NOT Selected by LASSO or RF): ['metformin-rosiglitazone', 'metformin-pioglitazone']
Creating a Reduced Dataset with Important Variables (X_train_selected and X_test_selected)¶
# Train/test split was already created. Do this to avoid leakage.
# Identify encoded features corresponding to original features selected by both LASSO and RF
selected_encoded_features = [col for col in X_train.columns if any(feature in col for feature in common_features)]
# Create a Reduced Dataset with Encoded Features
X_train_selected = X_train[selected_encoded_features].copy()
X_test_selected = X_test[selected_encoded_features].copy() # Ensure same columns in test set
# Ensure train and test have the same columns (Test set may lack some categories)
X_test_selected = X_test_selected.reindex(columns=X_train_selected.columns, fill_value=0)
# Final check
print("Final Training Shape:", X_train_selected.shape)
print("Final Testing Shape:", X_test_selected.shape)
Final Training Shape: (81412, 2351) Final Testing Shape: (20354, 2351)
Creating a Second Dataset Filtering to Fewer Variables (X_train_super_reduced and X_test_super_reduced)¶
# Reducing the dataset further just to see...
# Because accuracy is so high, maybe we don't need all of these features and based on EDA we can drop several.
# Super Reduced Dataset
# Only key medication & two numerical features
# super_reduced_features = ['glyburide-metformin', 'glyburide', 'acarbose', 'rosiglitazone', 'pioglitazone', 'repaglinide',
# 'metformin', 'nateglinide', 'diabetesMed', 'glimepiride', 'glipizide', 'chlorpropamide',
# 'insulin', 'num_medications', 'time_in_hospital']
# Only key medication features
# super_reduced_features = ['glyburide-metformin', 'glyburide', 'acarbose', 'rosiglitazone', 'pioglitazone', 'repaglinide',
# 'metformin', 'nateglinide', 'glimepiride', 'glipizide', 'chlorpropamide',
# 'insulin']
# Medication variables with the largest variation between levels (top level < 90%)
# super_reduced_features = ['glyburide', 'metformin', 'glipizide', 'insulin']
# These 4 give these results:
# Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9295, AUC: 0.9571, F1: 0.9175
# Medication variables with most frequent levels < 95%)
# super_reduced_features = ['glyburide', 'metformin', 'glipizide', 'insulin', 'glimepiride', 'rosiglitazone', 'pioglitazone']
# These 7 give these results:
# Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9891, AUC: 0.9941, F1: 0.9881
super_reduced_features = ['glyburide', 'metformin', 'glipizide', 'insulin', 'glimepiride', 'rosiglitazone', 'pioglitazone', 'repaglinide']
super_reduced_encoded_features = [col for col in X_train.columns if any(feature in col for feature in super_reduced_features)]
# Create the super reduced dataset
X_train_super_reduced = X_train[super_reduced_encoded_features].copy()
X_test_super_reduced = X_test[super_reduced_encoded_features].copy()
X_test_super_reduced = X_test_super_reduced.reindex(columns=X_train_super_reduced.columns, fill_value=0)
print("Super Reduced Training Shape:", X_train_super_reduced.shape)
print("Super Reduced Testing Shape:", X_test_super_reduced.shape)
# In case we want to add numerical variables
# Only keep numerical columns that exist in the super reduced dataset
numerical_cols_reduced = [col for col in numerical_cols if col in X_train_super_reduced.columns]
# Update the preprocessor with the new numerical columns
preprocessor_reduced = ColumnTransformer([('num', StandardScaler(), numerical_cols_reduced)], remainder='passthrough')
Super Reduced Training Shape: (81412, 39) Super Reduced Testing Shape: (20354, 39)
- An initial feature selection process reduced the dataset to the most important variables.
- A further-reduced dataset was created, retaining only 8 highly relevant medication-related features.
- Modeling and evaluation were performed with both sets.
Modeling & Evaluation: Classification Task¶
Define Classifiers¶
We tested four classifiers:
- SGD Logistic Regression
- Selected for interpretability and computational efficiency when applied to large datasets.
- SGD Support Vector Machine (SVM)
- Chosen as a similar alternative to Logistic Regression with potentially better decision boundaries.
- Multinomial Naive Bayes
- Selected for computational efficiency due to its probabalistic nature. A multinomial version was selected because the data are neither all Gaussian or binary.
- AdaBoost Decision Tree
- Included for its ensemble-based flexibility and ability to improve weak learners.
# Define Classifiers
clf_lr = SGDClassifier(loss="log_loss", penalty="l2", alpha=1e-05, eta0=0.01,
max_iter=500, class_weight="balanced",
learning_rate="adaptive", n_jobs=2, random_state=1234)
clf_svm = SGDClassifier(loss="hinge", penalty="l2", # Hinge loss for SVM
max_iter=500, class_weight="balanced",
n_jobs=-1, random_state=1234)
clf_nb = MultinomialNB(alpha=0.5) # Tune alpha later
clf_dt = AdaBoostClassifier(estimator=DecisionTreeClassifier(criterion='entropy', max_depth=1, class_weight='balanced'),
learning_rate=0.5, # Tune small to prevent overfitting and larger to increase contribution of each classifier
n_estimators=200) # Tune estimators later
# Define classifier labels
clf_labels = ['SGD Logistic Regression', 'SGD Support Vector Machine', 'Multinomial Naive Bayes', 'AdaBoost Decision Tree']
# Pipelines
clf_lr_pipeline = Pipeline([('preprocessor', preprocessor), ('clf', clf_lr)])
clf_svm_pipeline = Pipeline([('preprocessor', preprocessor), ('clf', clf_svm)])
# clf_nb # No scaling needed
clf_dt_pipeline = Pipeline([('preprocessor', preprocessor), ('clf', clf_dt)])
Tune Hyperparameters¶
To equally prioritize predicting patients both with and without a change in medication, we chose to use overall accuracy to tune our models.
For each model, hyperparameters were tuned to optimize performance. Key parameters included:
- SGD Logistic Regression
penalty: L1 (Lasso) vs. L2 (Ridge) regularization to prevent overfitting.alpha: Regularization strength (penalizes large coefficients; small values allow for less regularization, capturing more complex patterns; larger values result in stronger regularization, which prevents overfitting).eta0: Initial learning rate only when using an adaptive learning schedule (if too small, model learns slowly needs more iterations to converge; if too large, updates may overshoot the optimal solution and will failt to converge).learning_rate: Tested betweenoptimal(predefined step-size schedule) andadaptive(adjusted dynamically based on updates).max_iter: Maximum number of iterations to allow the model to converge.
- SGD Support Vector Machine (SVM)
- Uses the same hyperparameters as SGD Logistic Regression, but it optimizes hinge loss instead of log loss.
- Multinomial Naive Bayes
alpha: Smoothing parameter (higher values avoid zero probabilities but can overly smooth the estimates).class_prior: Explicit priors were tested, in case of a benefit to artificially weighting classes in favor of detecting medication changes.
- AdaBoost Decision Tree
n_estimators: Number of boosting iterations (higher values increase complexity but risk overfitting and also slow down tuning).learning_rate: Controls how much each weak learner contributes to the final prediction (lower learning_rate, more weak learners needed, smaller updates; higher learning_rate, fewer weak learners needed, larger contribution, risk of overfitting).
# Tune on a subset of of X_train the training data
# Underscores are variables returned but not needed
X_train_tune, _, y_train_tune, _ = train_test_split(X_train, y_train, train_size=0.5, stratify=y_train, random_state=1234)
print(f"Original training set size: {X_train.shape[0]}")
print(f"Subset size for tuning: {X_train_tune.shape[0]}")
# Hyperparameter tuning for SGDClassifier (Logistic Regression with SGD)
start_time = time.time()
param_grid_lr = {
'clf__penalty': ['l1', 'l2'], #l1 is lasso
'clf__alpha': [1e-4, 1e-3, 1e-2],
'clf__eta0': [0.001, 0.01],
'clf__learning_rate': ['optimal', 'adaptive'],
'clf__max_iter': [500]
}
grid_lr = GridSearchCV(
clf_lr_pipeline, param_grid_lr, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True, n_jobs=-1
)
# grid_lr = HalvingGridSearchCV(
# clf_lr_pipeline, param_grid_lr, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000, # Uses at most ~half of training data
# return_train_score=True, n_jobs=-1
# )
grid_lr.fit(X_train_tune, y_train_tune)
# Print accuracy and AUC for each hyperparameter combination
print("\nLogistic Regression (SGD) Classifier - Hyperparameter Performance:")
for penalty, alpha, eta0, learning_rate, max_iter, acc, auc, f1 in zip(
grid_lr.cv_results_['param_clf__penalty'],
grid_lr.cv_results_['param_clf__alpha'],
grid_lr.cv_results_['param_clf__eta0'],
grid_lr.cv_results_['param_clf__learning_rate'],
grid_lr.cv_results_['param_clf__max_iter'],
grid_lr.cv_results_['mean_test_accuracy'],
grid_lr.cv_results_['mean_test_roc_auc'],
grid_lr.cv_results_['mean_test_f1']
):
print(f"Penalty: {penalty}, Alpha: {alpha}, eta0: {eta0}, Learning Rate: {learning_rate}, Max Iter: {max_iter}, "
f"Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best hyperparameters
best_penalty = grid_lr.best_params_['clf__penalty']
best_alpha = grid_lr.best_params_['clf__alpha']
best_eta0 = grid_lr.best_params_['clf__eta0']
best_learning_rate = grid_lr.best_params_['clf__learning_rate']
best_max_iter = grid_lr.best_params_['clf__max_iter']
print(f"\nBest hyperparameters for Logistic Regression (SGD) Classifier - Penalty: {best_penalty}, Alpha: {best_alpha}, eta0: {best_eta0}, "
f"Learning Rate: {best_learning_rate}, Max Iter: {best_max_iter}")
# Update SGDClassifier with best hyperparameters
clf_lr_pipeline.set_params(clf__penalty=best_penalty)
clf_lr_pipeline.set_params(clf__alpha=best_alpha)
clf_lr_pipeline.set_params(clf__eta0=best_eta0)
clf_lr_pipeline.set_params(clf__learning_rate=best_learning_rate)
clf_lr_pipeline.set_params(clf__max_iter=best_max_iter)
print(f"LR Classifier Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
# Hyperparameter tuning for SGDClassifier (Support Vector Machine with SGD)
start_time = time.time()
param_grid_svm = {
'clf__penalty': ['l1', 'l2'],
'clf__alpha': [1e-5, 1e-4, 1e-3],
'clf__eta0': [0.001, 0.01, 0.1],
'clf__learning_rate': ['optimal', 'adaptive'],
'clf__max_iter': [500]
}
grid_svm = GridSearchCV(
clf_svm_pipeline, param_grid_svm, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True, n_jobs=-1
)
# grid_svm = HalvingGridSearchCV(
# clf_svm_pipeline, param_grid_svm, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000,
# return_train_score=True, n_jobs=-1
# )
grid_svm.fit(X_train_tune, y_train_tune)
# Print accuracy and AUC for each hyperparameter combination
print("\nSVM (SGD) Classifier - Hyperparameter Performance:")
for penalty, alpha, eta0, learning_rate, max_iter, acc, auc, f1 in zip(
grid_svm.cv_results_['param_clf__penalty'],
grid_svm.cv_results_['param_clf__alpha'],
grid_svm.cv_results_['param_clf__eta0'],
grid_svm.cv_results_['param_clf__learning_rate'],
grid_svm.cv_results_['param_clf__max_iter'],
grid_svm.cv_results_['mean_test_accuracy'],
grid_svm.cv_results_['mean_test_roc_auc'],
grid_svm.cv_results_['mean_test_f1']
):
print(f"Penalty: {penalty}, Alpha: {alpha}, eta0: {eta0}, Learning Rate: {learning_rate}, Max Iter: {max_iter}, "
f"Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best hyperparameters
best_penalty = grid_svm.best_params_['clf__penalty']
best_alpha = grid_svm.best_params_['clf__alpha']
best_eta0 = grid_svm.best_params_['clf__eta0']
best_learning_rate = grid_svm.best_params_['clf__learning_rate']
best_max_iter = grid_svm.best_params_['clf__max_iter']
print(f"\nBest hyperparameters for SVM (SGD) Classifier - Penalty: {best_penalty}, Alpha: {best_alpha}, eta0: {best_eta0}, "
f"Learning Rate: {best_learning_rate}, Max Iter: {best_max_iter}")
# Update SGDClassifier with best hyperparameters
clf_svm_pipeline.set_params(clf__penalty=best_penalty)
clf_svm_pipeline.set_params(clf__alpha=best_alpha)
clf_svm_pipeline.set_params(clf__eta0=best_eta0)
clf_svm_pipeline.set_params(clf__learning_rate=best_learning_rate)
clf_svm_pipeline.set_params(clf__max_iter=best_max_iter)
print(f"SVM Classifier Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
# Hyperparameter tuning for MultinomialNB
start_time = time.time()
param_grid_nb = {
'alpha': np.arange(0.1, 1.1, 0.1),
'class_prior': [(0.3, 0.7), (0.4, 0.6), (0.46, 0.54), (0.5, 0.5), (0.6, 0.4), (0.7, 0.3)] # Yes=46%
}
grid_nb = GridSearchCV(
clf_nb, param_grid_nb, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True
)
# grid_nb = HalvingGridSearchCV(
# clf_nb, param_grid_nb, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000,
# return_train_score=True
# )
grid_nb.fit(X_train_tune, y_train_tune)
# Print accuracy and recall for each hyperparameter combination
print("\nMultinomialNB - Hyperparameter Performance:")
for alpha, class_prior, acc, auc, f1 in zip(
grid_nb.cv_results_['param_alpha'],
grid_nb.cv_results_['param_class_prior'],
grid_nb.cv_results_['mean_test_accuracy'],
grid_nb.cv_results_['mean_test_roc_auc'],
grid_nb.cv_results_['mean_test_f1']
):
print(f"Alpha: {alpha}, Class Prior: {class_prior}, Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best alpha and class_prior
best_alpha = grid_nb.best_params_['alpha']
best_class_prior = grid_nb.best_params_['class_prior']
print(f"\nBest alpha for MNB: {best_alpha}, Best class prior: {best_class_prior}")
# Update NB classifier with best hyperparameters
clf_nb = MultinomialNB(alpha=best_alpha, class_prior=best_class_prior)
print(f"NB Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
# Hyperparameter tuning for AdaBoost
start_time = time.time()
param_grid_ab = {'clf__n_estimators': [50, 100, 200],
'clf__learning_rate': [0.1, 0.5, 1.0, 1.5]}
grid_ab = GridSearchCV(
clf_dt_pipeline, param_grid_ab, cv=2, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True
)
# grid_ab = HalvingGridSearchCV(
# clf_dt_pipeline, param_grid_ab, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000,
# return_train_score=True
# )
grid_ab.fit(X_train_tune, y_train_tune)
# Print accuracy and recall for each hyperparameter combination
print("\nAdaBoost - Hyperparameter Performance:")
for n_estimators, learning_rate, acc, auc, f1 in zip(
grid_ab.cv_results_['param_clf__n_estimators'],
grid_ab.cv_results_['param_clf__learning_rate'],
grid_ab.cv_results_['mean_test_accuracy'],
grid_ab.cv_results_['mean_test_roc_auc'],
grid_ab.cv_results_['mean_test_f1']
):
print(f"n_estimators: {n_estimators}, learning_rate: {learning_rate}, Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best n_estimators and learning_rate
best_n_estimators = grid_ab.best_params_['clf__n_estimators']
best_learning_rate = grid_ab.best_params_['clf__learning_rate']
print(f"\nBest n_estimators for AdaBoost: {best_n_estimators}, Best learning_rate: {best_learning_rate}")
# Update AdaBoost classifier with best n_estimators
clf_dt_pipeline.set_params(clf__n_estimators=best_n_estimators)
clf_dt_pipeline.set_params(clf__learning_rate=best_learning_rate)
print(f"AdaBoost Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
Original training set size: 81412 Subset size for tuning: 40706 Logistic Regression (SGD) Classifier - Hyperparameter Performance: Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9975, AUC: 0.9984, F1: 0.9973 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9945, AUC: 0.9981, F1: 0.9940 Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9822, AUC: 0.9944, F1: 0.9804 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9821, AUC: 0.9938, F1: 0.9803 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9975, AUC: 0.9984, F1: 0.9973 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9945, AUC: 0.9981, F1: 0.9940 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9955, AUC: 0.9979, F1: 0.9951 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9929, AUC: 0.9974, F1: 0.9922 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9876, AUC: 0.9947, F1: 0.9865 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9782, AUC: 0.9909, F1: 0.9760 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9769, AUC: 0.9883, F1: 0.9744 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9765, AUC: 0.9882, F1: 0.9740 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9876, AUC: 0.9947, F1: 0.9865 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9782, AUC: 0.9909, F1: 0.9760 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9850, AUC: 0.9939, F1: 0.9835 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9794, AUC: 0.9910, F1: 0.9772 Penalty: l1, Alpha: 0.01, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.8894, AUC: 0.9543, F1: 0.8782 Penalty: l2, Alpha: 0.01, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.8980, AUC: 0.9672, F1: 0.8918 Penalty: l1, Alpha: 0.01, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.8855, AUC: 0.9535, F1: 0.8735 Penalty: l2, Alpha: 0.01, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.8984, AUC: 0.9668, F1: 0.8918 Penalty: l1, Alpha: 0.01, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.8894, AUC: 0.9543, F1: 0.8782 Penalty: l2, Alpha: 0.01, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.8980, AUC: 0.9672, F1: 0.8918 Penalty: l1, Alpha: 0.01, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.8894, AUC: 0.9551, F1: 0.8779 Penalty: l2, Alpha: 0.01, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.8998, AUC: 0.9672, F1: 0.8931 Best hyperparameters for Logistic Regression (SGD) Classifier - Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500 LR Classifier Tuning Time: 0.81 minutes SVM (SGD) Classifier - Hyperparameter Performance: Penalty: l1, Alpha: 1e-05, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9967, AUC: 0.9993, F1: 0.9964 Penalty: l2, Alpha: 1e-05, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9976, AUC: 0.9993, F1: 0.9974 Penalty: l1, Alpha: 1e-05, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9900, AUC: 0.9962, F1: 0.9891 Penalty: l2, Alpha: 1e-05, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9925, AUC: 0.9964, F1: 0.9918 Penalty: l1, Alpha: 1e-05, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9967, AUC: 0.9993, F1: 0.9964 Penalty: l2, Alpha: 1e-05, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9976, AUC: 0.9993, F1: 0.9974 Penalty: l1, Alpha: 1e-05, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9980, AUC: 0.9990, F1: 0.9978 Penalty: l2, Alpha: 1e-05, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9980, AUC: 0.9990, F1: 0.9979 Penalty: l1, Alpha: 1e-05, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9967, AUC: 0.9993, F1: 0.9964 Penalty: l2, Alpha: 1e-05, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9976, AUC: 0.9993, F1: 0.9974 Penalty: l1, Alpha: 1e-05, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9994, AUC: 0.9997, F1: 0.9993 Penalty: l2, Alpha: 1e-05, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9993, AUC: 0.9996, F1: 0.9993 Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9972, AUC: 0.9983, F1: 0.9969 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9983, AUC: 0.9992, F1: 0.9982 Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9879, AUC: 0.9961, F1: 0.9868 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9891, AUC: 0.9963, F1: 0.9880 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9972, AUC: 0.9983, F1: 0.9969 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9983, AUC: 0.9992, F1: 0.9982 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9974, AUC: 0.9987, F1: 0.9971 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9972, AUC: 0.9989, F1: 0.9969 Penalty: l1, Alpha: 0.0001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9972, AUC: 0.9983, F1: 0.9969 Penalty: l2, Alpha: 0.0001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9983, AUC: 0.9992, F1: 0.9982 Penalty: l1, Alpha: 0.0001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9991, AUC: 0.9995, F1: 0.9990 Penalty: l2, Alpha: 0.0001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9986, AUC: 0.9994, F1: 0.9985 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9931, AUC: 0.9952, F1: 0.9925 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9843, AUC: 0.9953, F1: 0.9827 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9800, AUC: 0.9933, F1: 0.9779 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9824, AUC: 0.9942, F1: 0.9806 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9931, AUC: 0.9952, F1: 0.9925 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9843, AUC: 0.9953, F1: 0.9827 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9935, AUC: 0.9947, F1: 0.9929 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9841, AUC: 0.9959, F1: 0.9825 Penalty: l1, Alpha: 0.001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9931, AUC: 0.9952, F1: 0.9925 Penalty: l2, Alpha: 0.001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9843, AUC: 0.9953, F1: 0.9827 Penalty: l1, Alpha: 0.001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9935, AUC: 0.9944, F1: 0.9929 Penalty: l2, Alpha: 0.001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9843, AUC: 0.9960, F1: 0.9827 Best hyperparameters for SVM (SGD) Classifier - Penalty: l1, Alpha: 1e-05, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500 SVM Classifier Tuning Time: 0.82 minutes MultinomialNB - Hyperparameter Performance: Alpha: 0.1, Class Prior: (0.3, 0.7), Accuracy: 0.7776, AUC: 0.9278, F1: 0.7953 Alpha: 0.1, Class Prior: (0.4, 0.6), Accuracy: 0.8099, AUC: 0.9278, F1: 0.8150 Alpha: 0.1, Class Prior: (0.46, 0.54), Accuracy: 0.8231, AUC: 0.9278, F1: 0.8225 Alpha: 0.1, Class Prior: (0.5, 0.5), Accuracy: 0.8298, AUC: 0.9278, F1: 0.8257 Alpha: 0.1, Class Prior: (0.6, 0.4), Accuracy: 0.8430, AUC: 0.9278, F1: 0.8316 Alpha: 0.1, Class Prior: (0.7, 0.3), Accuracy: 0.8498, AUC: 0.9278, F1: 0.8314 Alpha: 0.2, Class Prior: (0.3, 0.7), Accuracy: 0.7782, AUC: 0.9287, F1: 0.7958 Alpha: 0.2, Class Prior: (0.4, 0.6), Accuracy: 0.8107, AUC: 0.9287, F1: 0.8157 Alpha: 0.2, Class Prior: (0.46, 0.54), Accuracy: 0.8239, AUC: 0.9287, F1: 0.8232 Alpha: 0.2, Class Prior: (0.5, 0.5), Accuracy: 0.8307, AUC: 0.9287, F1: 0.8267 Alpha: 0.2, Class Prior: (0.6, 0.4), Accuracy: 0.8437, AUC: 0.9287, F1: 0.8322 Alpha: 0.2, Class Prior: (0.7, 0.3), Accuracy: 0.8506, AUC: 0.9287, F1: 0.8322 Alpha: 0.30000000000000004, Class Prior: (0.3, 0.7), Accuracy: 0.7784, AUC: 0.9292, F1: 0.7960 Alpha: 0.30000000000000004, Class Prior: (0.4, 0.6), Accuracy: 0.8112, AUC: 0.9292, F1: 0.8162 Alpha: 0.30000000000000004, Class Prior: (0.46, 0.54), Accuracy: 0.8241, AUC: 0.9292, F1: 0.8235 Alpha: 0.30000000000000004, Class Prior: (0.5, 0.5), Accuracy: 0.8310, AUC: 0.9292, F1: 0.8270 Alpha: 0.30000000000000004, Class Prior: (0.6, 0.4), Accuracy: 0.8440, AUC: 0.9292, F1: 0.8325 Alpha: 0.30000000000000004, Class Prior: (0.7, 0.3), Accuracy: 0.8513, AUC: 0.9292, F1: 0.8328 Alpha: 0.4, Class Prior: (0.3, 0.7), Accuracy: 0.7786, AUC: 0.9295, F1: 0.7963 Alpha: 0.4, Class Prior: (0.4, 0.6), Accuracy: 0.8113, AUC: 0.9295, F1: 0.8164 Alpha: 0.4, Class Prior: (0.46, 0.54), Accuracy: 0.8244, AUC: 0.9295, F1: 0.8237 Alpha: 0.4, Class Prior: (0.5, 0.5), Accuracy: 0.8316, AUC: 0.9295, F1: 0.8275 Alpha: 0.4, Class Prior: (0.6, 0.4), Accuracy: 0.8447, AUC: 0.9295, F1: 0.8333 Alpha: 0.4, Class Prior: (0.7, 0.3), Accuracy: 0.8514, AUC: 0.9295, F1: 0.8329 Alpha: 0.5, Class Prior: (0.3, 0.7), Accuracy: 0.7786, AUC: 0.9297, F1: 0.7963 Alpha: 0.5, Class Prior: (0.4, 0.6), Accuracy: 0.8114, AUC: 0.9297, F1: 0.8165 Alpha: 0.5, Class Prior: (0.46, 0.54), Accuracy: 0.8246, AUC: 0.9297, F1: 0.8240 Alpha: 0.5, Class Prior: (0.5, 0.5), Accuracy: 0.8318, AUC: 0.9297, F1: 0.8278 Alpha: 0.5, Class Prior: (0.6, 0.4), Accuracy: 0.8452, AUC: 0.9297, F1: 0.8337 Alpha: 0.5, Class Prior: (0.7, 0.3), Accuracy: 0.8517, AUC: 0.9297, F1: 0.8331 Alpha: 0.6000000000000001, Class Prior: (0.3, 0.7), Accuracy: 0.7785, AUC: 0.9298, F1: 0.7963 Alpha: 0.6000000000000001, Class Prior: (0.4, 0.6), Accuracy: 0.8117, AUC: 0.9298, F1: 0.8168 Alpha: 0.6000000000000001, Class Prior: (0.46, 0.54), Accuracy: 0.8248, AUC: 0.9298, F1: 0.8242 Alpha: 0.6000000000000001, Class Prior: (0.5, 0.5), Accuracy: 0.8319, AUC: 0.9298, F1: 0.8279 Alpha: 0.6000000000000001, Class Prior: (0.6, 0.4), Accuracy: 0.8454, AUC: 0.9298, F1: 0.8340 Alpha: 0.6000000000000001, Class Prior: (0.7, 0.3), Accuracy: 0.8518, AUC: 0.9298, F1: 0.8332 Alpha: 0.7000000000000001, Class Prior: (0.3, 0.7), Accuracy: 0.7784, AUC: 0.9299, F1: 0.7963 Alpha: 0.7000000000000001, Class Prior: (0.4, 0.6), Accuracy: 0.8119, AUC: 0.9299, F1: 0.8170 Alpha: 0.7000000000000001, Class Prior: (0.46, 0.54), Accuracy: 0.8248, AUC: 0.9299, F1: 0.8241 Alpha: 0.7000000000000001, Class Prior: (0.5, 0.5), Accuracy: 0.8324, AUC: 0.9299, F1: 0.8283 Alpha: 0.7000000000000001, Class Prior: (0.6, 0.4), Accuracy: 0.8455, AUC: 0.9299, F1: 0.8340 Alpha: 0.7000000000000001, Class Prior: (0.7, 0.3), Accuracy: 0.8520, AUC: 0.9299, F1: 0.8334 Alpha: 0.8, Class Prior: (0.3, 0.7), Accuracy: 0.7785, AUC: 0.9299, F1: 0.7963 Alpha: 0.8, Class Prior: (0.4, 0.6), Accuracy: 0.8119, AUC: 0.9299, F1: 0.8170 Alpha: 0.8, Class Prior: (0.46, 0.54), Accuracy: 0.8249, AUC: 0.9299, F1: 0.8242 Alpha: 0.8, Class Prior: (0.5, 0.5), Accuracy: 0.8324, AUC: 0.9299, F1: 0.8283 Alpha: 0.8, Class Prior: (0.6, 0.4), Accuracy: 0.8455, AUC: 0.9299, F1: 0.8341 Alpha: 0.8, Class Prior: (0.7, 0.3), Accuracy: 0.8521, AUC: 0.9299, F1: 0.8334 Alpha: 0.9, Class Prior: (0.3, 0.7), Accuracy: 0.7785, AUC: 0.9300, F1: 0.7963 Alpha: 0.9, Class Prior: (0.4, 0.6), Accuracy: 0.8116, AUC: 0.9300, F1: 0.8167 Alpha: 0.9, Class Prior: (0.46, 0.54), Accuracy: 0.8254, AUC: 0.9300, F1: 0.8247 Alpha: 0.9, Class Prior: (0.5, 0.5), Accuracy: 0.8325, AUC: 0.9300, F1: 0.8285 Alpha: 0.9, Class Prior: (0.6, 0.4), Accuracy: 0.8456, AUC: 0.9300, F1: 0.8342 Alpha: 0.9, Class Prior: (0.7, 0.3), Accuracy: 0.8521, AUC: 0.9300, F1: 0.8334 Alpha: 1.0, Class Prior: (0.3, 0.7), Accuracy: 0.7785, AUC: 0.9300, F1: 0.7964 Alpha: 1.0, Class Prior: (0.4, 0.6), Accuracy: 0.8116, AUC: 0.9300, F1: 0.8167 Alpha: 1.0, Class Prior: (0.46, 0.54), Accuracy: 0.8256, AUC: 0.9300, F1: 0.8248 Alpha: 1.0, Class Prior: (0.5, 0.5), Accuracy: 0.8325, AUC: 0.9300, F1: 0.8284 Alpha: 1.0, Class Prior: (0.6, 0.4), Accuracy: 0.8455, AUC: 0.9300, F1: 0.8340 Alpha: 1.0, Class Prior: (0.7, 0.3), Accuracy: 0.8522, AUC: 0.9300, F1: 0.8334 Best alpha for MNB: 1.0, Best class prior: (0.7, 0.3) NB Tuning Time: 2.59 minutes AdaBoost - Hyperparameter Performance: n_estimators: 50, learning_rate: 0.1, Accuracy: 0.6906, AUC: 0.8565, F1: 0.7494 n_estimators: 100, learning_rate: 0.1, Accuracy: 0.7694, AUC: 0.8565, F1: 0.6678 n_estimators: 200, learning_rate: 0.1, Accuracy: 0.7694, AUC: 0.8787, F1: 0.6678 n_estimators: 50, learning_rate: 0.5, Accuracy: 0.7999, AUC: 0.8787, F1: 0.7319 n_estimators: 100, learning_rate: 0.5, Accuracy: 0.7999, AUC: 0.8787, F1: 0.7319 n_estimators: 200, learning_rate: 0.5, Accuracy: 0.7999, AUC: 0.8787, F1: 0.7319 n_estimators: 50, learning_rate: 1.0, Accuracy: 0.8343, AUC: 0.9032, F1: 0.8025 n_estimators: 100, learning_rate: 1.0, Accuracy: 0.8343, AUC: 0.9032, F1: 0.8025 n_estimators: 200, learning_rate: 1.0, Accuracy: 0.8343, AUC: 0.9032, F1: 0.8025 n_estimators: 50, learning_rate: 1.5, Accuracy: 0.8787, AUC: 0.9370, F1: 0.8597 n_estimators: 100, learning_rate: 1.5, Accuracy: 0.8787, AUC: 0.9370, F1: 0.8597 n_estimators: 200, learning_rate: 1.5, Accuracy: 0.8787, AUC: 0.9370, F1: 0.8597 Best n_estimators for AdaBoost: 50, Best learning_rate: 1.5 AdaBoost Tuning Time: 7.21 minutes
Cross-Validate Classifiers (SGD Logistic Regression, SGD Support Vector Machine, Multinomial Naive Bayes, and AdaBoost Decision Tree)¶
start_time_total = time.time() # Track total time
# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# Dictionary to store model training times and accuracy scores
cv_times = {}
cv_accuracy_scores = {} # Store accuracy scores for each model
# Cross-validation: Recall, Accuracy, F1
print("\n10-fold Cross-Validation Metrics:\n")
for clf, label in zip([clf_lr_pipeline, clf_svm_pipeline, clf_nb, clf_dt_pipeline], clf_labels):
start_time = time.time() # Start timing for the model
# Compute and store accuracy scores
accuracy_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=skf, scoring='accuracy', n_jobs=-1)
cv_accuracy_scores[label] = accuracy_scores # Store for later use
# Compute and display AUC and F1 scores
roc_auc_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=skf, scoring='roc_auc', n_jobs=-1)
f1_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=skf, scoring='f1', n_jobs=-1)
end_time = time.time() # End timing
# Store elapsed time in minutes
cv_times[label] = (end_time - start_time) / 60
print(f"{label}:")
print(f" CV Accuracy: {accuracy_scores.mean():.3f} (+/- {accuracy_scores.std():.3f})")
print(f" CV AUC: {roc_auc_scores.mean():.3f} (+/- {roc_auc_scores.std():.3f})")
print(f" CV F1 Score: {f1_scores.mean():.3f} (+/- {f1_scores.std():.3f})\n")
# Print Cross-Validation Timing Per Model
print("\nCross-Validation Time per Model (in minutes):")
for model, time_taken in cv_times.items():
print(f"{model}: {time_taken:.2f} minutes")
# Print Total Time Taken
end_time_total = time.time()
print(f"\nTotal Elapsed Time for Cross-Validation: {(end_time_total - start_time_total)/60:.2f} minutes")
10-fold Cross-Validation Metrics: SGD Logistic Regression: CV Accuracy: 0.998 (+/- 0.000) CV AUC: 0.999 (+/- 0.000) CV F1 Score: 0.998 (+/- 0.000) SGD Support Vector Machine: CV Accuracy: 1.000 (+/- 0.000) CV AUC: 1.000 (+/- 0.000) CV F1 Score: 1.000 (+/- 0.000) Multinomial Naive Bayes: CV Accuracy: 0.853 (+/- 0.004) CV AUC: 0.932 (+/- 0.002) CV F1 Score: 0.834 (+/- 0.004) AdaBoost Decision Tree: CV Accuracy: 0.833 (+/- 0.003) CV AUC: 0.903 (+/- 0.002) CV F1 Score: 0.801 (+/- 0.004) Cross-Validation Time per Model (in minutes): SGD Logistic Regression: 1.81 minutes SGD Support Vector Machine: 5.92 minutes Multinomial Naive Bayes: 0.31 minutes AdaBoost Decision Tree: 2.29 minutes Total Elapsed Time for Cross-Validation: 10.32 minutes
Statistical Comparison of Models¶
To determine whether the performance differences were statistically significant, we performed paired t-tests with a Bonferroni correction and Tukey’s HSD Test.
# Extract accuracy scores from dictionary
lr_acc = cv_accuracy_scores["SGD Logistic Regression"]
svm_acc = cv_accuracy_scores["SGD Support Vector Machine"]
nb_acc = cv_accuracy_scores["Multinomial Naive Bayes"]
dt_acc = cv_accuracy_scores["AdaBoost Decision Tree"]
# Define all model pairs for pairwise comparisons
model_pairs = [
("SGD Logistic Regression", "SGD Support Vector Machine", lr_acc, svm_acc),
("SGD Logistic Regression", "Multinomial Naive Bayes", lr_acc, nb_acc),
("SGD Logistic Regression", "AdaBoost Decision Tree", lr_acc, dt_acc),
("SGD Support Vector Machine", "Multinomial Naive Bayes", svm_acc, nb_acc),
("SGD Support Vector Machine", "AdaBoost Decision Tree", svm_acc, dt_acc),
("Multinomial Naive Bayes", "AdaBoost Decision Tree", nb_acc, dt_acc),
]
# Perform paired t-tests
p_values = []
t_stats = []
print("\nPairwise t-test results:")
for name1, name2, acc1, acc2 in model_pairs:
t_stat, p_val = ttest_rel(acc1, acc2)
t_stats.append(t_stat)
p_values.append(p_val)
print(f"{name1} vs {name2}: t = {t_stat:.3f}, p = {p_val:.5f}")
# Apply Bonferroni correction
_, p_corrected_bonferroni, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')
print("\nBonferroni-corrected p-values:")
for i, (name1, name2, _, _) in enumerate(model_pairs):
print(f"{name1} vs {name2}: p = {p_corrected_bonferroni[i]:.5f}")
# === Tukey's HSD Test ===
# Combine all accuracy scores into a single array
all_scores = np.concatenate([lr_acc, svm_acc, nb_acc, dt_acc])
# Create labels for each model (repeated for each fold)
model_labels = (["SGD Logistic Regression"] * len(lr_acc) +
["SGD Support Vector Machine"] * len(svm_acc) +
["Multinomial Naive Bayes"] * len(nb_acc) +
["AdaBoost Decision Tree"] * len(dt_acc))
# Perform Tukey's HSD test
tukey_results = pairwise_tukeyhsd(all_scores, model_labels, alpha=0.05)
print("\nTukey's HSD Test Results:")
print(tukey_results)
Pairwise t-test results:
SGD Logistic Regression vs SGD Support Vector Machine: t = -15.749, p = 0.00000
SGD Logistic Regression vs Multinomial Naive Bayes: t = 121.308, p = 0.00000
SGD Logistic Regression vs AdaBoost Decision Tree: t = 184.556, p = 0.00000
SGD Support Vector Machine vs Multinomial Naive Bayes: t = 121.484, p = 0.00000
SGD Support Vector Machine vs AdaBoost Decision Tree: t = 180.937, p = 0.00000
Multinomial Naive Bayes vs AdaBoost Decision Tree: t = 20.352, p = 0.00000
Bonferroni-corrected p-values:
SGD Logistic Regression vs SGD Support Vector Machine: p = 0.00000
SGD Logistic Regression vs Multinomial Naive Bayes: p = 0.00000
SGD Logistic Regression vs AdaBoost Decision Tree: p = 0.00000
SGD Support Vector Machine vs Multinomial Naive Bayes: p = 0.00000
SGD Support Vector Machine vs AdaBoost Decision Tree: p = 0.00000
Multinomial Naive Bayes vs AdaBoost Decision Tree: p = 0.00000
Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
========================================================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------------------------------------------
AdaBoost Decision Tree Multinomial Naive Bayes 0.0204 0.0 0.0175 0.0233 True
AdaBoost Decision Tree SGD Logistic Regression 0.1654 0.0 0.1625 0.1683 True
AdaBoost Decision Tree SGD Support Vector Machine 0.1671 0.0 0.1642 0.17 True
Multinomial Naive Bayes SGD Logistic Regression 0.145 0.0 0.1421 0.1479 True
Multinomial Naive Bayes SGD Support Vector Machine 0.1467 0.0 0.1438 0.1496 True
SGD Logistic Regression SGD Support Vector Machine 0.0017 0.4131 -0.0012 0.0046 False
----------------------------------------------------------------------------------------
Paired t-tests with Bonferroni correction
- Compared paired accuracy scores across cross-validation folds.
- Found a statistically significant difference between all models.
Tukey’s HSD Test
- Compared model accuracy as independent groups not as paired observations (folds).
- More appropriate for comparing mean accuracy of many models.
- Typically less conservative than the Bonferroni.
- Did not find a significant difference between SGD Logistic Regression and SVM.
- Confirmed Naive Bayes and AdaBoost were significantly different from the SGD-based models.
Though Logistic Regression vs. SVM was statistically significant using Bonferroni-corrected paired t-test, the difference was very small. Practically, Logistic Regression and SVM perform equivalently.
Evaluate Classifiers on the Holdout Set¶
# Fit the models
clf_lr_pipeline.fit(X_train, y_train)
clf_svm_pipeline.fit(X_train, y_train)
clf_nb.fit(X_train, y_train)
clf_dt_pipeline.fit(X_train, y_train)
# Create subplots for CM, ROC, and PR curves
fig, axes = plt.subplots(4, 3, figsize=(15, 20)) # 4 rows, 3 columns (CM, PR, ROC)
for i, (clf, label) in enumerate(zip([clf_lr_pipeline, clf_svm_pipeline, clf_nb, clf_dt_pipeline], clf_labels)):
# Predictions
y_test_pred = clf.predict(X_test)
# Use predict_proba() if available; otherwise, use decision_function()
if hasattr(clf, "predict_proba"):
y_test_prob = clf.predict_proba(X_test)[:, 1] # Probabilities for positive class
else:
y_test_prob = clf.decision_function(X_test) # Decision scores (not probabilities)
# Confusion Matrix
cm_test = mt.confusion_matrix(y_test, y_test_pred)
cm_display = mt.ConfusionMatrixDisplay(cm_test)
axes[i, 0].set_title(f"Confusion Matrix - {label}")
cm_display.plot(ax=axes[i, 0], cmap="Blues", colorbar=False)
# ROC Curve
fpr, tpr, _ = mt.roc_curve(y_test, y_test_prob)
axes[i, 1].plot(fpr, tpr, label=f"{label} (AUC={mt.roc_auc_score(y_test, y_test_prob):.2f})")
axes[i, 1].set_title("ROC Curve")
axes[i, 1].set_xlabel("False Positive Rate")
axes[i, 1].set_ylabel("True Positive Rate")
axes[i, 1].legend(loc="best")
# Precision-Recall Curve
precision, recall, _ = mt.precision_recall_curve(y_test, y_test_prob)
axes[i, 2].plot(recall, precision, label=f"{label}")
axes[i, 2].set_title("Precision-Recall Curve")
axes[i, 2].set_xlabel("Recall")
axes[i, 2].set_ylabel("Precision")
axes[i, 2].legend(loc="best")
# Print final metrics for comparison
print(f"\nPerformance Metrics for {label}:")
print(f" Accuracy: {mt.accuracy_score(y_test, y_test_pred):.4f}")
print(f" ROC AUC: {mt.roc_auc_score(y_test, y_test_prob):.4f}")
print(f" F1 Score: {mt.f1_score(y_test, y_test_pred):.4f}")
print(f" Precision: {mt.precision_score(y_test, y_test_pred):.4f}")
print(f" Recall: {mt.recall_score(y_test, y_test_pred):.4f}")
print("\n")
plt.tight_layout()
plt.show()
Performance Metrics for SGD Logistic Regression: Accuracy: 0.9979 ROC AUC: 0.9986 F1 Score: 0.9978 Precision: 0.9997 Recall: 0.9958 Performance Metrics for SGD Support Vector Machine: Accuracy: 0.9999 ROC AUC: 0.9999 F1 Score: 0.9999 Precision: 1.0000 Recall: 0.9998 Performance Metrics for Multinomial Naive Bayes: Accuracy: 0.8531 ROC AUC: 0.9321 F1 Score: 0.8345 Precision: 0.8661 Recall: 0.8052 Performance Metrics for AdaBoost Decision Tree: Accuracy: 0.8337 ROC AUC: 0.9029 F1 Score: 0.8020 Precision: 0.8865 Recall: 0.7322
Chosen metrics based on the fact that it is equally important to predict patients with a medication change and those without.
- Accuracy – Provides a general measure of correctness, which is appropriate given the balanced classes and the need for equal weight on both outcomes.
- ROC AUC – A threshold-independent metric that visualizes the trade-off between true positives and false positives across different classification thresholds, helping assess overall model discrimination.
- F1 Score – Balances precision and recall, ensuring that the model is not overly biased toward one class.
- Precision & Recall – Helps analyze the trade-offs between minimizing false positives (incorrectly predicted medication changes) and false negatives (missed medication changes).
Summary of the Plotted Evaluation Metrics (Full Feature Set)
- Confusion Matrix (Left): Shows correct and incorrect predictions.
- ROC Curve & AUC Score (Middle): Assesses the model’s ability to distinguish between classes across different thresholds.
- Precision-Recall (PR) Curve (Right): Visualizes the trade-off between precision and recall.
Takeaways:
- SGD Logistic Regression & SGD SVM
- Nearly perfect classification with almost no false positives or false negatives (Logistic Regression: 3/39, SVM: 0/2)
- ROC AUC = 1.00, meaning near perfect class separation.
- PR Curve is ideal, showing near-perfect recall and precision.
- Near perfect result poses the question, is this real?
- Multinomial Naive Bayes (Third Model)
- More false negatives (1,824) and false positives (1,165).
- ROC AUC = 0.93, good but weaker class separation than the SGD models.
- AdaBoost Decision Tree (Bottom Model)
- More false negatives (2,507) and false positives (878) than SGD models and a different balance than Naive Bayes
- ROC AUC = 0.90, still good but slightly weaker than Naive Bayes.
Tuning on Dataset with Even Fewer Features¶
# Tune on a subset of of X_train the training data
# Underscores are variables returned but not needed
X_train_super_reduced_tune, _, y_train_tune, _ = train_test_split(X_train_super_reduced, y_train, train_size=0.5, stratify=y_train, random_state=1234)
print(f"Original training set size: {X_train.shape[0]}")
print(f"Subset size for tuning: {X_train_tune.shape[0]}")
print('Results for Super Reduced Dataset')
# Pipelines
clf_lr_pipeline_red = Pipeline([('preprocessor', preprocessor_reduced), ('clf', clf_lr)])
clf_svm_pipeline_red = Pipeline([('preprocessor', preprocessor_reduced), ('clf', clf_svm)])
# clf_nb # No scaling needed
clf_dt_pipeline_red = Pipeline([('preprocessor', preprocessor_reduced), ('clf', clf_dt)])
# Hyperparameter tuning for SGDClassifier (Logistic Regression with SGD)
start_time = time.time()
param_grid_lr = {
'clf__penalty': ['l1', 'l2'], #l1 is lasso
'clf__alpha': [1e-4, 1e-3, 1e-2],
'clf__eta0': [0.001, 0.01],
'clf__learning_rate': ['optimal', 'adaptive'],
'clf__max_iter': [500]
}
grid_lr = GridSearchCV(
clf_lr_pipeline_red, param_grid_lr, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True, n_jobs=-1
)
# grid_lr = HalvingGridSearchCV(
# clf_lr_pipeline_red, param_grid_lr, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000, # Uses at most ~half of training data
# return_train_score=True, n_jobs=-1
# )
grid_lr.fit(X_train_super_reduced_tune, y_train_tune)
# Print accuracy and AUC for each hyperparameter combination
print("\nLogistic Regression (SGD) Classifier - Hyperparameter Performance:")
for penalty, alpha, eta0, learning_rate, max_iter, acc, auc, f1 in zip(
grid_lr.cv_results_['param_clf__penalty'],
grid_lr.cv_results_['param_clf__alpha'],
grid_lr.cv_results_['param_clf__eta0'],
grid_lr.cv_results_['param_clf__learning_rate'],
grid_lr.cv_results_['param_clf__max_iter'],
grid_lr.cv_results_['mean_test_accuracy'],
grid_lr.cv_results_['mean_test_roc_auc'],
grid_lr.cv_results_['mean_test_f1']
):
print(f"Penalty: {penalty}, Alpha: {alpha}, eta0: {eta0}, Learning Rate: {learning_rate}, Max Iter: {max_iter}, "
f"Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best hyperparameters
best_penalty = grid_lr.best_params_['clf__penalty']
best_alpha = grid_lr.best_params_['clf__alpha']
best_eta0 = grid_lr.best_params_['clf__eta0']
best_learning_rate = grid_lr.best_params_['clf__learning_rate']
best_max_iter = grid_lr.best_params_['clf__max_iter']
print(f"\nBest hyperparameters for Logistic Regression (SGD) Classifier - Penalty: {best_penalty}, Alpha: {best_alpha}, eta0: {best_eta0}, "
f"Learning Rate: {best_learning_rate}, Max Iter: {best_max_iter}")
# Update SGDClassifier with best hyperparameters
clf_lr_pipeline_red.set_params(clf__penalty=best_penalty)
clf_lr_pipeline_red.set_params(clf__alpha=best_alpha)
clf_lr_pipeline_red.set_params(clf__eta0=best_eta0)
clf_lr_pipeline_red.set_params(clf__learning_rate=best_learning_rate)
clf_lr_pipeline_red.set_params(clf__max_iter=best_max_iter)
print(f"LR Classifier Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
# Hyperparameter tuning for SGDClassifier (Support Vector Machine with SGD)
start_time = time.time()
param_grid_svm = {
'clf__penalty': ['l1', 'l2'],
'clf__alpha': [1e-5, 1e-4, 1e-3],
'clf__eta0': [0.001, 0.01, 0.1],
'clf__learning_rate': ['optimal', 'adaptive'],
'clf__max_iter': [500]
}
grid_svm = GridSearchCV(
clf_svm_pipeline_red, param_grid_svm, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True, n_jobs=-1
)
# grid_svm = HalvingGridSearchCV(
# clf_svm_pipeline_red, param_grid_svm, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000,
# return_train_score=True, n_jobs=-1
# )
grid_svm.fit(X_train_tune, y_train_tune)
# Print accuracy and AUC for each hyperparameter combination
print("\nSVM (SGD) Classifier - Hyperparameter Performance:")
for penalty, alpha, eta0, learning_rate, max_iter, acc, auc, f1 in zip(
grid_svm.cv_results_['param_clf__penalty'],
grid_svm.cv_results_['param_clf__alpha'],
grid_svm.cv_results_['param_clf__eta0'],
grid_svm.cv_results_['param_clf__learning_rate'],
grid_svm.cv_results_['param_clf__max_iter'],
grid_svm.cv_results_['mean_test_accuracy'],
grid_svm.cv_results_['mean_test_roc_auc'],
grid_svm.cv_results_['mean_test_f1']
):
print(f"Penalty: {penalty}, Alpha: {alpha}, eta0: {eta0}, Learning Rate: {learning_rate}, Max Iter: {max_iter}, "
f"Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best hyperparameters
best_penalty = grid_svm.best_params_['clf__penalty']
best_alpha = grid_svm.best_params_['clf__alpha']
best_eta0 = grid_svm.best_params_['clf__eta0']
best_learning_rate = grid_svm.best_params_['clf__learning_rate']
best_max_iter = grid_svm.best_params_['clf__max_iter']
print(f"\nBest hyperparameters for SVM (SGD) Classifier - Penaly: {best_penalty}, Alpha: {best_alpha}, eta0: {best_eta0}, "
f"Learning Rate: {best_learning_rate}, Max Iter: {best_max_iter}")
# Update SGDClassifier with best hyperparameters
clf_svm_pipeline_red.set_params(clf__penalty=best_penalty)
clf_svm_pipeline_red.set_params(clf__alpha=best_alpha)
clf_svm_pipeline_red.set_params(clf__eta0=best_eta0)
clf_svm_pipeline_red.set_params(clf__learning_rate=best_learning_rate)
clf_svm_pipeline_red.set_params(clf__max_iter=best_max_iter)
print(f"SVM Classifier Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
# Hyperparameter tuning for MultinomialNB
start_time = time.time()
param_grid_nb = {
'alpha': np.arange(0.1, 1.1, 0.1),
'class_prior': [(0.3, 0.7), (0.4, 0.6), (0.46, 0.54), (0.5, 0.5), (0.6, 0.4), (0.7, 0.3)] # Yes=46%
}
grid_nb = GridSearchCV(
clf_nb, param_grid_nb, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True
)
# grid_nb = HalvingGridSearchCV(
# clf_nb, param_grid_nb, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000,
# return_train_score=True
# )
grid_nb.fit(X_train_super_reduced_tune, y_train_tune)
# Print accuracy and recall for each hyperparameter combination
print("\nMultinomialNB - Hyperparameter Performance:")
for alpha, class_prior, acc, auc, f1 in zip(
grid_nb.cv_results_['param_alpha'],
grid_nb.cv_results_['param_class_prior'],
grid_nb.cv_results_['mean_test_accuracy'],
grid_nb.cv_results_['mean_test_roc_auc'],
grid_nb.cv_results_['mean_test_f1']
):
print(f"Alpha: {alpha}, Class Prior: {class_prior}, Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best alpha and class_prior
best_alpha = grid_nb.best_params_['alpha']
best_class_prior = grid_nb.best_params_['class_prior']
print(f"\nBest alpha for MNB: {best_alpha}, Best class prior: {best_class_prior}")
# Update NB classifier with best hyperparameters
clf_nb = MultinomialNB(alpha=best_alpha, class_prior=best_class_prior)
print(f"NB Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
# Hyperparameter tuning for AdaBoost
start_time = time.time()
param_grid_ab = {'clf__n_estimators': [50, 100, 200],
'clf__learning_rate': [0.1, 0.5, 1.0, 1.5]}
grid_ab = GridSearchCV(
clf_dt_pipeline_red, param_grid_ab, cv=3, scoring=['accuracy', 'roc_auc', 'f1'], refit='accuracy',
return_train_score=True
)
# grid_ab = HalvingGridSearchCV(
# clf_dt_pipeline_red, param_grid_ab, cv=3, factor=2, scoring='accuracy',
# resource="n_samples", max_resources=40000,
# return_train_score=True
# )
grid_ab.fit(X_train_super_reduced_tune, y_train_tune)
# Print accuracy and recall for each hyperparameter combination
print("\nAdaBoost - Hyperparameter Performance:")
for n_estimators, learning_rate, acc, auc, f1 in zip(
grid_ab.cv_results_['param_clf__n_estimators'],
grid_ab.cv_results_['param_clf__learning_rate'],
grid_ab.cv_results_['mean_test_accuracy'],
grid_ab.cv_results_['mean_test_roc_auc'],
grid_ab.cv_results_['mean_test_f1']
):
print(f"n_estimators: {n_estimators}, learning_rate: {learning_rate}, Accuracy: {acc:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}")
# Get best n_estimators and learning_rate
best_n_estimators = grid_ab.best_params_['clf__n_estimators']
best_learning_rate = grid_ab.best_params_['clf__learning_rate']
print(f"\nBest n_estimators for AdaBoost: {best_n_estimators}, Best learning_rate: {best_learning_rate}")
# Update AdaBoost classifier with best n_estimators
clf_dt_pipeline_red.set_params(clf__n_estimators=best_n_estimators)
clf_dt_pipeline_red.set_params(clf__learning_rate=best_learning_rate)
print(f"AdaBoost Tuning Time: {(time.time() - start_time)/60:.2f} minutes")
Original training set size: 81412 Subset size for tuning: 40706 Results for Super Reduced Dataset Logistic Regression (SGD) Classifier - Hyperparameter Performance: Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9957, AUC: 0.9974, F1: 0.9953 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9933, AUC: 0.9965, F1: 0.9927 Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9801, AUC: 0.9930, F1: 0.9780 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9802, AUC: 0.9926, F1: 0.9781 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9957, AUC: 0.9974, F1: 0.9953 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9933, AUC: 0.9965, F1: 0.9927 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9928, AUC: 0.9962, F1: 0.9921 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9919, AUC: 0.9955, F1: 0.9911 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9859, AUC: 0.9922, F1: 0.9845 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9794, AUC: 0.9889, F1: 0.9772 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9773, AUC: 0.9858, F1: 0.9749 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9769, AUC: 0.9847, F1: 0.9744 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9859, AUC: 0.9922, F1: 0.9845 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9794, AUC: 0.9889, F1: 0.9772 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9832, AUC: 0.9924, F1: 0.9815 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9794, AUC: 0.9888, F1: 0.9772 Penalty: l1, Alpha: 0.01, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9468, AUC: 0.9605, F1: 0.9390 Penalty: l2, Alpha: 0.01, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9587, AUC: 0.9777, F1: 0.9532 Penalty: l1, Alpha: 0.01, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9468, AUC: 0.9602, F1: 0.9390 Penalty: l2, Alpha: 0.01, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9586, AUC: 0.9771, F1: 0.9532 Penalty: l1, Alpha: 0.01, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9468, AUC: 0.9605, F1: 0.9390 Penalty: l2, Alpha: 0.01, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9587, AUC: 0.9777, F1: 0.9532 Penalty: l1, Alpha: 0.01, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9468, AUC: 0.9602, F1: 0.9390 Penalty: l2, Alpha: 0.01, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9586, AUC: 0.9772, F1: 0.9532 Best hyperparameters for Logistic Regression (SGD) Classifier - Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500 LR Classifier Tuning Time: 0.03 minutes SVM (SGD) Classifier - Hyperparameter Performance: Penalty: l1, Alpha: 1e-05, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9736, AUC: 0.9934, F1: 0.9716 Penalty: l2, Alpha: 1e-05, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9743, AUC: 0.9958, F1: 0.9719 Penalty: l1, Alpha: 1e-05, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9957, AUC: 0.9977, F1: 0.9953 Penalty: l2, Alpha: 1e-05, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9944, AUC: 0.9968, F1: 0.9940 Penalty: l1, Alpha: 1e-05, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9736, AUC: 0.9934, F1: 0.9716 Penalty: l2, Alpha: 1e-05, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9743, AUC: 0.9958, F1: 0.9719 Penalty: l1, Alpha: 1e-05, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9964, AUC: 0.9982, F1: 0.9961 Penalty: l2, Alpha: 1e-05, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9946, AUC: 0.9971, F1: 0.9941 Penalty: l1, Alpha: 1e-05, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9736, AUC: 0.9934, F1: 0.9716 Penalty: l2, Alpha: 1e-05, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9743, AUC: 0.9958, F1: 0.9719 Penalty: l1, Alpha: 1e-05, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9950, AUC: 0.9983, F1: 0.9946 Penalty: l2, Alpha: 1e-05, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9921, AUC: 0.9969, F1: 0.9915 Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9607, AUC: 0.9868, F1: 0.9575 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9751, AUC: 0.9944, F1: 0.9727 Penalty: l1, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9945, AUC: 0.9969, F1: 0.9940 Penalty: l2, Alpha: 0.0001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9933, AUC: 0.9965, F1: 0.9927 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9607, AUC: 0.9868, F1: 0.9575 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9751, AUC: 0.9944, F1: 0.9727 Penalty: l1, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9979, AUC: 0.9987, F1: 0.9978 Penalty: l2, Alpha: 0.0001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9943, AUC: 0.9965, F1: 0.9938 Penalty: l1, Alpha: 0.0001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9607, AUC: 0.9868, F1: 0.9575 Penalty: l2, Alpha: 0.0001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9751, AUC: 0.9944, F1: 0.9727 Penalty: l1, Alpha: 0.0001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9981, AUC: 0.9986, F1: 0.9979 Penalty: l2, Alpha: 0.0001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9753, AUC: 0.9933, F1: 0.9732 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9518, AUC: 0.9797, F1: 0.9477 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9806, AUC: 0.9904, F1: 0.9785 Penalty: l1, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9864, AUC: 0.9953, F1: 0.9850 Penalty: l2, Alpha: 0.001, eta0: 0.001, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9824, AUC: 0.9903, F1: 0.9806 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9518, AUC: 0.9797, F1: 0.9477 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9806, AUC: 0.9904, F1: 0.9785 Penalty: l1, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9935, AUC: 0.9957, F1: 0.9929 Penalty: l2, Alpha: 0.001, eta0: 0.01, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9799, AUC: 0.9881, F1: 0.9778 Penalty: l1, Alpha: 0.001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9518, AUC: 0.9797, F1: 0.9477 Penalty: l2, Alpha: 0.001, eta0: 0.1, Learning Rate: optimal, Max Iter: 500, Accuracy: 0.9806, AUC: 0.9904, F1: 0.9785 Penalty: l1, Alpha: 0.001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9942, AUC: 0.9964, F1: 0.9937 Penalty: l2, Alpha: 0.001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500, Accuracy: 0.9793, AUC: 0.9881, F1: 0.9771 Best hyperparameters for SVM (SGD) Classifier - Penaly: l1, Alpha: 0.0001, eta0: 0.1, Learning Rate: adaptive, Max Iter: 500 SVM Classifier Tuning Time: 4.24 minutes MultinomialNB - Hyperparameter Performance: Alpha: 0.1, Class Prior: (0.3, 0.7), Accuracy: 0.5312, AUC: 0.7417, F1: 0.6120 Alpha: 0.1, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7417, F1: 0.5774 Alpha: 0.1, Class Prior: (0.46, 0.54), Accuracy: 0.5581, AUC: 0.7417, F1: 0.5997 Alpha: 0.1, Class Prior: (0.5, 0.5), Accuracy: 0.7885, AUC: 0.7417, F1: 0.7483 Alpha: 0.1, Class Prior: (0.6, 0.4), Accuracy: 0.8098, AUC: 0.7417, F1: 0.7537 Alpha: 0.1, Class Prior: (0.7, 0.3), Accuracy: 0.8119, AUC: 0.7417, F1: 0.7447 Alpha: 0.2, Class Prior: (0.3, 0.7), Accuracy: 0.5314, AUC: 0.7417, F1: 0.6124 Alpha: 0.2, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7417, F1: 0.5774 Alpha: 0.2, Class Prior: (0.46, 0.54), Accuracy: 0.5581, AUC: 0.7417, F1: 0.5997 Alpha: 0.2, Class Prior: (0.5, 0.5), Accuracy: 0.7885, AUC: 0.7417, F1: 0.7483 Alpha: 0.2, Class Prior: (0.6, 0.4), Accuracy: 0.8098, AUC: 0.7417, F1: 0.7537 Alpha: 0.2, Class Prior: (0.7, 0.3), Accuracy: 0.8118, AUC: 0.7417, F1: 0.7446 Alpha: 0.30000000000000004, Class Prior: (0.3, 0.7), Accuracy: 0.5314, AUC: 0.7417, F1: 0.6124 Alpha: 0.30000000000000004, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7417, F1: 0.5774 Alpha: 0.30000000000000004, Class Prior: (0.46, 0.54), Accuracy: 0.5581, AUC: 0.7417, F1: 0.5997 Alpha: 0.30000000000000004, Class Prior: (0.5, 0.5), Accuracy: 0.7885, AUC: 0.7417, F1: 0.7483 Alpha: 0.30000000000000004, Class Prior: (0.6, 0.4), Accuracy: 0.8098, AUC: 0.7417, F1: 0.7537 Alpha: 0.30000000000000004, Class Prior: (0.7, 0.3), Accuracy: 0.8118, AUC: 0.7417, F1: 0.7446 Alpha: 0.4, Class Prior: (0.3, 0.7), Accuracy: 0.5314, AUC: 0.7417, F1: 0.6124 Alpha: 0.4, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7417, F1: 0.5774 Alpha: 0.4, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7417, F1: 0.5996 Alpha: 0.4, Class Prior: (0.5, 0.5), Accuracy: 0.7885, AUC: 0.7417, F1: 0.7483 Alpha: 0.4, Class Prior: (0.6, 0.4), Accuracy: 0.8097, AUC: 0.7417, F1: 0.7536 Alpha: 0.4, Class Prior: (0.7, 0.3), Accuracy: 0.8118, AUC: 0.7417, F1: 0.7446 Alpha: 0.5, Class Prior: (0.3, 0.7), Accuracy: 0.5314, AUC: 0.7417, F1: 0.6123 Alpha: 0.5, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7417, F1: 0.5774 Alpha: 0.5, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7417, F1: 0.5996 Alpha: 0.5, Class Prior: (0.5, 0.5), Accuracy: 0.7884, AUC: 0.7417, F1: 0.7482 Alpha: 0.5, Class Prior: (0.6, 0.4), Accuracy: 0.8097, AUC: 0.7417, F1: 0.7536 Alpha: 0.5, Class Prior: (0.7, 0.3), Accuracy: 0.8118, AUC: 0.7417, F1: 0.7446 Alpha: 0.6000000000000001, Class Prior: (0.3, 0.7), Accuracy: 0.5312, AUC: 0.7416, F1: 0.6121 Alpha: 0.6000000000000001, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7416, F1: 0.5774 Alpha: 0.6000000000000001, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7416, F1: 0.5996 Alpha: 0.6000000000000001, Class Prior: (0.5, 0.5), Accuracy: 0.7884, AUC: 0.7416, F1: 0.7482 Alpha: 0.6000000000000001, Class Prior: (0.6, 0.4), Accuracy: 0.8097, AUC: 0.7416, F1: 0.7536 Alpha: 0.6000000000000001, Class Prior: (0.7, 0.3), Accuracy: 0.8118, AUC: 0.7416, F1: 0.7446 Alpha: 0.7000000000000001, Class Prior: (0.3, 0.7), Accuracy: 0.5312, AUC: 0.7416, F1: 0.6121 Alpha: 0.7000000000000001, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7416, F1: 0.5774 Alpha: 0.7000000000000001, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7416, F1: 0.5996 Alpha: 0.7000000000000001, Class Prior: (0.5, 0.5), Accuracy: 0.7884, AUC: 0.7416, F1: 0.7482 Alpha: 0.7000000000000001, Class Prior: (0.6, 0.4), Accuracy: 0.8106, AUC: 0.7416, F1: 0.7549 Alpha: 0.7000000000000001, Class Prior: (0.7, 0.3), Accuracy: 0.8118, AUC: 0.7416, F1: 0.7446 Alpha: 0.8, Class Prior: (0.3, 0.7), Accuracy: 0.5315, AUC: 0.7416, F1: 0.6125 Alpha: 0.8, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7416, F1: 0.5774 Alpha: 0.8, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7416, F1: 0.5996 Alpha: 0.8, Class Prior: (0.5, 0.5), Accuracy: 0.7884, AUC: 0.7416, F1: 0.7482 Alpha: 0.8, Class Prior: (0.6, 0.4), Accuracy: 0.8106, AUC: 0.7416, F1: 0.7549 Alpha: 0.8, Class Prior: (0.7, 0.3), Accuracy: 0.8117, AUC: 0.7416, F1: 0.7443 Alpha: 0.9, Class Prior: (0.3, 0.7), Accuracy: 0.5315, AUC: 0.7416, F1: 0.6125 Alpha: 0.9, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7416, F1: 0.5773 Alpha: 0.9, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7416, F1: 0.5996 Alpha: 0.9, Class Prior: (0.5, 0.5), Accuracy: 0.7884, AUC: 0.7416, F1: 0.7482 Alpha: 0.9, Class Prior: (0.6, 0.4), Accuracy: 0.8105, AUC: 0.7416, F1: 0.7549 Alpha: 0.9, Class Prior: (0.7, 0.3), Accuracy: 0.8116, AUC: 0.7416, F1: 0.7442 Alpha: 1.0, Class Prior: (0.3, 0.7), Accuracy: 0.5315, AUC: 0.7416, F1: 0.6125 Alpha: 1.0, Class Prior: (0.4, 0.6), Accuracy: 0.5017, AUC: 0.7416, F1: 0.5773 Alpha: 1.0, Class Prior: (0.46, 0.54), Accuracy: 0.5580, AUC: 0.7416, F1: 0.5996 Alpha: 1.0, Class Prior: (0.5, 0.5), Accuracy: 0.7884, AUC: 0.7416, F1: 0.7482 Alpha: 1.0, Class Prior: (0.6, 0.4), Accuracy: 0.8105, AUC: 0.7416, F1: 0.7549 Alpha: 1.0, Class Prior: (0.7, 0.3), Accuracy: 0.8116, AUC: 0.7416, F1: 0.7441 Best alpha for MNB: 0.1, Best class prior: (0.7, 0.3) NB Tuning Time: 0.06 minutes AdaBoost - Hyperparameter Performance: n_estimators: 50, learning_rate: 0.1, Accuracy: 0.7694, AUC: 0.7507, F1: 0.6679 n_estimators: 100, learning_rate: 0.1, Accuracy: 0.7694, AUC: 0.7507, F1: 0.6679 n_estimators: 200, learning_rate: 0.1, Accuracy: 0.7694, AUC: 0.8455, F1: 0.6679 n_estimators: 50, learning_rate: 0.5, Accuracy: 0.7694, AUC: 0.8455, F1: 0.6679 n_estimators: 100, learning_rate: 0.5, Accuracy: 0.7694, AUC: 0.8455, F1: 0.6679 n_estimators: 200, learning_rate: 0.5, Accuracy: 0.7694, AUC: 0.8455, F1: 0.6679 n_estimators: 50, learning_rate: 1.0, Accuracy: 0.8343, AUC: 0.8455, F1: 0.8025 n_estimators: 100, learning_rate: 1.0, Accuracy: 0.8343, AUC: 0.8455, F1: 0.8025 n_estimators: 200, learning_rate: 1.0, Accuracy: 0.8343, AUC: 0.8455, F1: 0.8025 n_estimators: 50, learning_rate: 1.5, Accuracy: 0.9534, AUC: 0.9744, F1: 0.9470 n_estimators: 100, learning_rate: 1.5, Accuracy: 0.9607, AUC: 0.9749, F1: 0.9556 n_estimators: 200, learning_rate: 1.5, Accuracy: 0.9607, AUC: 0.9749, F1: 0.9556 Best n_estimators for AdaBoost: 100, Best learning_rate: 1.5 AdaBoost Tuning Time: 0.44 minutes
Cross-Validation with Super Reduced Dataset¶
start_time_total = time.time() # Track total time
# Define Stratified K-Fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# Dictionary to store model training times and accuracy scores
cv_times = {}
cv_accuracy_scores = {} # Store accuracy scores for each model
# Cross-validation: Recall, Accuracy, F1
print("\n10-fold Cross-Validation Metrics:\n")
for clf, label in zip([clf_lr_pipeline_red, clf_svm_pipeline_red, clf_nb, clf_dt_pipeline_red], clf_labels):
start_time = time.time() # Start timing for the model
# Compute and store accuracy scores
accuracy_scores = cross_val_score(estimator=clf, X=X_train_super_reduced, y=y_train, cv=skf, scoring='accuracy', n_jobs=-1)
cv_accuracy_scores[label] = accuracy_scores # Store for later use
# Compute and display AUC and F1 scores
roc_auc_scores = cross_val_score(estimator=clf, X=X_train_super_reduced, y=y_train, cv=skf, scoring='roc_auc', n_jobs=-1)
f1_scores = cross_val_score(estimator=clf, X=X_train_super_reduced, y=y_train, cv=skf, scoring='f1', n_jobs=-1)
end_time = time.time() # End timing
# Store elapsed time in minutes
cv_times[label] = (end_time - start_time) / 60
print(f"{label}:")
print(f" CV Accuracy: {accuracy_scores.mean():.3f} (+/- {accuracy_scores.std():.3f})")
print(f" CV AUC: {roc_auc_scores.mean():.3f} (+/- {roc_auc_scores.std():.3f})")
print(f" CV F1 Score: {f1_scores.mean():.3f} (+/- {f1_scores.std():.3f})\n")
# Print Cross-Validation Timing Per Model
print("\nCross-Validation Time per Model (in minutes):")
for model, time_taken in cv_times.items():
print(f"{model}: {time_taken:.2f} minutes")
# Print Total Time Taken
end_time_total = time.time()
print(f"\nTotal Elapsed Time for Cross-Validation: {(end_time_total - start_time_total)/60:.2f} minutes")
10-fold Cross-Validation Metrics: SGD Logistic Regression: CV Accuracy: 0.996 (+/- 0.001) CV AUC: 0.998 (+/- 0.000) CV F1 Score: 0.996 (+/- 0.001) SGD Support Vector Machine: CV Accuracy: 0.997 (+/- 0.001) CV AUC: 0.998 (+/- 0.000) CV F1 Score: 0.996 (+/- 0.001) Multinomial Naive Bayes: CV Accuracy: 0.810 (+/- 0.004) CV AUC: 0.735 (+/- 0.008) CV F1 Score: 0.741 (+/- 0.007) AdaBoost Decision Tree: CV Accuracy: 0.966 (+/- 0.002) CV AUC: 0.975 (+/- 0.002) CV F1 Score: 0.961 (+/- 0.003) Cross-Validation Time per Model (in minutes): SGD Logistic Regression: 0.03 minutes SGD Support Vector Machine: 0.04 minutes Multinomial Naive Bayes: 0.01 minutes AdaBoost Decision Tree: 0.07 minutes Total Elapsed Time for Cross-Validation: 0.14 minutes
Statistical Comparison of Models¶
# Extract accuracy scores from dictionary
lr_acc = cv_accuracy_scores["SGD Logistic Regression"]
svm_acc = cv_accuracy_scores["SGD Support Vector Machine"]
nb_acc = cv_accuracy_scores["Multinomial Naive Bayes"]
dt_acc = cv_accuracy_scores["AdaBoost Decision Tree"]
# Define all model pairs for pairwise comparisons
model_pairs = [
("SGD Logistic Regression", "SGD Support Vector Machine", lr_acc, svm_acc),
("SGD Logistic Regression", "Multinomial Naive Bayes", lr_acc, nb_acc),
("SGD Logistic Regression", "AdaBoost Decision Tree", lr_acc, dt_acc),
("SGD Support Vector Machine", "Multinomial Naive Bayes", svm_acc, nb_acc),
("SGD Support Vector Machine", "AdaBoost Decision Tree", svm_acc, dt_acc),
("Multinomial Naive Bayes", "AdaBoost Decision Tree", nb_acc, dt_acc),
]
# Perform paired t-tests
p_values = []
t_stats = []
print("\nPairwise t-test results:")
for name1, name2, acc1, acc2 in model_pairs:
t_stat, p_val = ttest_rel(acc1, acc2)
t_stats.append(t_stat)
p_values.append(p_val)
print(f"{name1} vs {name2}: t = {t_stat:.3f}, p = {p_val:.5f}")
# Apply Bonferroni correction
_, p_corrected_bonferroni, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')
print("\nBonferroni-corrected p-values:")
for i, (name1, name2, _, _) in enumerate(model_pairs):
print(f"{name1} vs {name2}: p = {p_corrected_bonferroni[i]:.5f}")
# === Tukey's HSD Test ===
# Combine all accuracy scores into a single array
all_scores = np.concatenate([lr_acc, svm_acc, nb_acc, dt_acc])
# Create labels for each model (repeated for each fold)
model_labels = (["SGD Logistic Regression"] * len(lr_acc) +
["SGD Support Vector Machine"] * len(svm_acc) +
["Multinomial Naive Bayes"] * len(nb_acc) +
["AdaBoost Decision Tree"] * len(dt_acc))
# Perform Tukey's HSD test
tukey_results = pairwise_tukeyhsd(all_scores, model_labels, alpha=0.05)
print("\nTukey's HSD Test Results:")
print(tukey_results)
Pairwise t-test results:
SGD Logistic Regression vs SGD Support Vector Machine: t = -7.250, p = 0.00005
SGD Logistic Regression vs Multinomial Naive Bayes: t = 154.749, p = 0.00000
SGD Logistic Regression vs AdaBoost Decision Tree: t = 37.336, p = 0.00000
SGD Support Vector Machine vs Multinomial Naive Bayes: t = 150.741, p = 0.00000
SGD Support Vector Machine vs AdaBoost Decision Tree: t = 38.209, p = 0.00000
Multinomial Naive Bayes vs AdaBoost Decision Tree: t = -145.486, p = 0.00000
Bonferroni-corrected p-values:
SGD Logistic Regression vs SGD Support Vector Machine: p = 0.00029
SGD Logistic Regression vs Multinomial Naive Bayes: p = 0.00000
SGD Logistic Regression vs AdaBoost Decision Tree: p = 0.00000
SGD Support Vector Machine vs Multinomial Naive Bayes: p = 0.00000
SGD Support Vector Machine vs AdaBoost Decision Tree: p = 0.00000
Multinomial Naive Bayes vs AdaBoost Decision Tree: p = 0.00000
Tukey's HSD Test Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
=========================================================================================
group1 group2 meandiff p-adj lower upper reject
-----------------------------------------------------------------------------------------
AdaBoost Decision Tree Multinomial Naive Bayes -0.1558 0.0 -0.1587 -0.1528 True
AdaBoost Decision Tree SGD Logistic Regression 0.0304 0.0 0.0274 0.0333 True
AdaBoost Decision Tree SGD Support Vector Machine 0.031 0.0 0.028 0.034 True
Multinomial Naive Bayes SGD Logistic Regression 0.1861 0.0 0.1832 0.1891 True
Multinomial Naive Bayes SGD Support Vector Machine 0.1868 0.0 0.1838 0.1897 True
SGD Logistic Regression SGD Support Vector Machine 0.0007 0.9335 -0.0023 0.0036 False
-----------------------------------------------------------------------------------------
Same results as found for the dataset including more features:
- Paired t-tests with Bonferroni correction found a statistically significant difference between all models.
- Tukey’s HSD Test did not find a significant difference between Logistic Regression and SVM. It confirmed Naive Bayes and AdaBoost were significantly different from the SGD-based models.
Model Evaluation with Super Reduced Dataset¶
# Fit the models
clf_lr_pipeline_red.fit(X_train_super_reduced, y_train)
clf_svm_pipeline_red.fit(X_train_super_reduced, y_train)
clf_nb.fit(X_train_super_reduced, y_train)
clf_dt_pipeline_red.fit(X_train_super_reduced, y_train)
# Create subplots for CM, ROC, and PR curves
fig, axes = plt.subplots(4, 3, figsize=(15, 20)) # 4 rows, 3 columns (CM, PR, ROC)
for i, (clf, label) in enumerate(zip([clf_lr_pipeline_red, clf_svm_pipeline_red, clf_nb, clf_dt_pipeline_red], clf_labels)):
# Predictions
y_test_pred = clf.predict(X_test_super_reduced)
# Use predict_proba() if available; otherwise, use decision_function()
if hasattr(clf, "predict_proba"):
y_test_prob = clf.predict_proba(X_test_super_reduced)[:, 1] # Probabilities for positive class
else:
y_test_prob = clf.decision_function(X_test_super_reduced) # Decision scores (not probabilities)
# Confusion Matrix
cm_test = mt.confusion_matrix(y_test, y_test_pred)
cm_display = mt.ConfusionMatrixDisplay(cm_test)
axes[i, 0].set_title(f"Confusion Matrix - {label}")
cm_display.plot(ax=axes[i, 0], cmap="Blues", colorbar=False)
# ROC Curve
fpr, tpr, _ = mt.roc_curve(y_test, y_test_prob)
axes[i, 1].plot(fpr, tpr, label=f"{label} (AUC={mt.roc_auc_score(y_test, y_test_prob):.2f})")
axes[i, 1].set_title("ROC Curve")
axes[i, 1].set_xlabel("False Positive Rate")
axes[i, 1].set_ylabel("True Positive Rate")
axes[i, 1].legend(loc="best")
# Precision-Recall Curve
precision, recall, _ = mt.precision_recall_curve(y_test, y_test_prob)
axes[i, 2].plot(recall, precision, label=f"{label}")
axes[i, 2].set_title("Precision-Recall Curve")
axes[i, 2].set_xlabel("Recall")
axes[i, 2].set_ylabel("Precision")
axes[i, 2].legend(loc="best")
# Print final metrics for comparison
print(f"\nPerformance Metrics for {label}:")
print(f" Accuracy: {mt.accuracy_score(y_test, y_test_pred):.4f}")
print(f" ROC AUC: {mt.roc_auc_score(y_test, y_test_prob):.4f}")
print(f" F1 Score: {mt.f1_score(y_test, y_test_pred):.4f}")
print(f" Precision: {mt.precision_score(y_test, y_test_pred):.4f}")
print(f" Recall: {mt.recall_score(y_test, y_test_pred):.4f}")
print(f"\n")
plt.tight_layout()
plt.show()
Performance Metrics for SGD Logistic Regression: Accuracy: 0.9961 ROC AUC: 0.9977 F1 Score: 0.9958 Precision: 1.0000 Recall: 0.9916 Performance Metrics for SGD Support Vector Machine: Accuracy: 0.9966 ROC AUC: 0.9976 F1 Score: 0.9962 Precision: 1.0000 Recall: 0.9925 Performance Metrics for Multinomial Naive Bayes: Accuracy: 0.8107 ROC AUC: 0.7349 F1 Score: 0.7409 Precision: 0.9998 Recall: 0.5885 Performance Metrics for AdaBoost Decision Tree: Accuracy: 0.9689 ROC AUC: 0.9746 F1 Score: 0.9650 Precision: 1.0000 Recall: 0.9323
Takeaways:
- SGD Logistic Regression & SGD SVM
- Nearly perfect classification with no false positives and very few false negatives.
- Logistic Regression: 0 False Positives / 79 False Negatives.
- SVM: 0 False Positives / 70 False Negatives.
- ROC AUC = 1.00, near-perfect class separation.
- PR Curve is ideal, showing near-perfect recall and precision.
- Nearly perfect classification with no false positives and very few false negatives.
- Multinomial Naive Bayes
- More false negatives (3,852) compared to SGD models.
- ROC AUC = 0.73, lower than the 0.93 when more variables were included.
- AdaBoost Decision Tree
- Fewer false negatives (634) than Naive Bayes or the AdaBoost including more features. No false positives.
- ROC AUC = 0.97 higher than the 0.90 using more features.
- SGD Logistic Regression and SVM consistently performed best, with SVM slightly outperforming LR in accuracy when more features were included.
- Reducing features improved AdaBoost significantly (from 83.3% to 96.9% accuracy), likely by reducing overfitting improving model generalization.
- Logistic Regression was much faster than SVM and AdaBoost.
- Naive Bayes was the fastest model but consistently the weakest. Surprisingly, its performance dropped when reducing features.
- All except the Naive Bayes model performed similarly or better after reducing the number of features from those selected by feature selection to eight medication variables.
- All models predicted the negative class better than the positive class, as indicated by the higher number of false negatives compared to false positives, leading to slightly higher precision than recall.
- All of the models performed similarly comparing the metrics from cross-validation on the training set and the metrics from the heldout testing set.
Recommendation: Though SVM slightly outperformed Logistic Regression and AdaBoost Decision Tree models, we would recommend the Logistic Regression model for the following reasons.
- It is faster and computationally more efficient.
- It is very interpretable.
- It performs well with both a reduced dataset of very important features and a more expanded dataset with feature of lower importance.
- It generalizes well, performing consistently well using cross-validation on training data and when evaluated on unseen testing data.
Data Preparation: Regression Task¶
Our second task is predicting the number of days a patient will be hospitalized, the variable time_in_hospital.
Preprocessing: Separating the Response Variable, Encoding and Scaling¶
# Get a copy of the data prior preprocessing for classification
df2 = df_clean.copy()
# Extract response variable
y_time = df2['time_in_hospital']
X_time = df2.drop(columns=['time_in_hospital'])
# Define categorical and numerical feature subsets
categorical_cols = X_time.select_dtypes(include=['object', 'category']).columns
numerical_cols = X_time.select_dtypes(include=['int64', 'float64']).columns
# One-Hot Encoding categorical variables
X_time_encoded = pd.get_dummies(X_time, columns=categorical_cols, drop_first=True) # drop_first for multicollinearity issues
# Define the preprocessing pipeline
# Apply StandardScaler only to numerical columns while keeping categorical (one-hot encoded) features unchanged
preprocessor = ColumnTransformer([('num', StandardScaler(), numerical_cols)], remainder='passthrough')
- The response variable
time_in_hospitalwas separated from the explanatory features. - Categorical variables were encoded, and scaling was applied to numerical variables for appropriate models.
Review Summary Statistics of the Response¶
# Display summary statistics for the response variable
y_summary = y_time.describe()
# Print summary statistics
print("\nSummary Statistics for 'time_in_hospital':")
print(y_summary)
# Plot distribution of response variable
plt.figure()
plt.hist(y_time, edgecolor='black', alpha=0.7)
plt.xlabel("Days in Hospital")
plt.ylabel("Frequency")
plt.title("Distribution of 'time_in_hospital'")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
Summary Statistics for 'time_in_hospital': count 101766.000000 mean 4.395987 std 2.985108 min 1.000000 25% 2.000000 50% 4.000000 75% 6.000000 max 14.000000 Name: time_in_hospital, dtype: float64
- The response variable is right skewed.
Strategy for Performance Validation¶
- The dataset was split into training and holdout sets (80/20 split).
- Lasso feature selection was performed with 5-fold CV.
- Hyperparameter tuning was conducted using different approaches for efficiency:
- 5-fold CV was used for models where
GridSearchCVwas implemented. - 7-fold CV was used for models where
RandomizedSearchCVwas implemented, allowing a more computationally efficient search over a wider hyperparameter space.
- 5-fold CV was used for models where
- 10-fold CV was performed with tuned parameters to statistically compare the MSE of the models.
- The holdout test set (20%) was reserved for final model evaluation after tuning.
- Given the dataset size (~100,000 records), an 80/20 split was chosen to:
- Ensure sufficient samples for training and validation, while leaving enough data for an unbiased test set.
- Preserve representation across categories, ensuring the training set captures the variance in categorical features.
Split the Data into a Training and Holdout Set Prior to Feature Selection¶
# Split into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_time_encoded, y_time, test_size=0.2, random_state=1234)
Feature Selection¶
# ----- LASSO Feature Selection -----
start_time = time.time()
lasso_selector = LassoCV(cv=5, random_state=1234)
# Create a pipeline with preprocessing and LASSO
lasso_pipeline = Pipeline([
('preprocessor', preprocessor),
('clf', lasso_selector)
])
# Fit the pipeline
lasso_pipeline.fit(X_train, y_train)
# Retrieve feature names after transformation
feature_names = numerical_cols.tolist() + [col for col in X_train.columns if col not in numerical_cols]
# Select features with non-zero coefficients
lasso_selected_features = [feature_names[i] for i in range(len(feature_names)) if lasso_selector.coef_[i] != 0]
print(f"LASSO time: {(time.time() - start_time)/60:.2f} minutes")
# ----- Random Forest Feature Selection -----
start_time = time.time()
rf_selector = RandomForestRegressor(n_estimators=100, random_state=1234)
rf_selector.fit(X_train, y_train)
# Get feature importances and select top features
feature_importances = pd.Series(rf_selector.feature_importances_, index=X_train.columns)
rf_selected_features = feature_importances[feature_importances > np.percentile(feature_importances, 75)].index.tolist() # Top 25% features
print(f"RF time: {(time.time() - start_time)/60:.2f} minutes")
# - `lasso_selected_features` is the list of features selected by LASSO
# - `rf_selected_features` is the list of features selected by Random Forest
# - `X_time` is the original dataset before encoding
# - `categorical_cols` and `numerical_cols` are already defined
# Convert feature lists to DataFrames for processing
lasso_df = pd.DataFrame({'Feature': lasso_selected_features})
rf_df = pd.DataFrame({'Feature': rf_selected_features})
# ----- Extract Original Variable Names for LASSO -----
lasso_numeric = lasso_df['Feature'][lasso_df['Feature'].isin(numerical_cols)]
lasso_categorical = lasso_df['Feature'][~lasso_df['Feature'].isin(numerical_cols)]
lasso_original_categorical = lasso_categorical.str.replace(r'_[^_]+$', '', regex=True) # Remove one-hot encoding suffixes
# Combine numeric and cleaned categorical features
lasso_unique_features = pd.concat([lasso_numeric, lasso_original_categorical]).unique()
# ----- Extract Original Variable Names for Random Forest -----
rf_numeric = rf_df['Feature'][rf_df['Feature'].isin(numerical_cols)]
rf_categorical = rf_df['Feature'][~rf_df['Feature'].isin(numerical_cols)]
rf_original_categorical = rf_categorical.str.replace(r'_[^_]+$', '', regex=True)
# Combine numeric and cleaned categorical features
rf_unique_features = pd.concat([rf_numeric, rf_original_categorical]).unique()
# ----- Find Common & Unused Features -----
dataset_features = pd.Series(X_time.columns) # Original dataset BEFORE encoding
# Features selected by both LASSO & RF
common_features = pd.Series(list(set(lasso_unique_features) & set(rf_unique_features)))
# Features in dataset but not selected by either method
unused_features = dataset_features[~dataset_features.isin(pd.concat([pd.Series(lasso_unique_features), pd.Series(rf_unique_features)]))]
# Print Results
print("\nLASSO Unique Selected Features (Original Variables):")
print(lasso_unique_features)
print("\nRandom Forest Unique Selected Features (Original Variables):")
print(rf_unique_features)
print("\nCommon Features Selected by Both LASSO and RF:")
print(common_features.to_list())
print("\nUnused Features (Present in Dataset but NOT Selected by LASSO or RF):")
print(unused_features.to_list())
LASSO time: 0.41 minutes RF time: 7.45 minutes LASSO Unique Selected Features (Original Variables): ['num_lab_procedures' 'num_procedures' 'num_medications' 'number_outpatient' 'number_emergency' 'number_inpatient' 'number_diagnoses' 'race' 'age' 'admission_type_id' 'discharge_disposition_id' 'admission_source_id' 'payer_code' 'medical_specialty' 'diag_1' 'diag_2' 'diag_3' 'max_glu_serum' 'A1Cresult' 'metformin' 'repaglinide' 'glimepiride' 'glipizide' 'glyburide' 'pioglitazone' 'rosiglitazone' 'insulin' 'change' 'diabetesMed' 'readmitted'] Random Forest Unique Selected Features (Original Variables): ['num_lab_procedures' 'num_procedures' 'num_medications' 'number_outpatient' 'number_emergency' 'number_inpatient' 'number_diagnoses' 'race' 'gender' 'age' 'admission_type_id' 'discharge_disposition_id' 'admission_source_id' 'payer_code' 'medical_specialty' 'diag_1' 'diag_2' 'diag_3' 'max_glu_serum' 'A1Cresult' 'metformin' 'repaglinide' 'nateglinide' 'glimepiride' 'glipizide' 'glyburide' 'pioglitazone' 'rosiglitazone' 'acarbose' 'insulin' 'glyburide-metformin' 'change' 'diabetesMed' 'readmitted'] Common Features Selected by Both LASSO and RF: ['insulin', 'number_outpatient', 'admission_source_id', 'repaglinide', 'discharge_disposition_id', 'diag_2', 'rosiglitazone', 'glyburide', 'max_glu_serum', 'diag_1', 'num_procedures', 'diabetesMed', 'race', 'num_lab_procedures', 'admission_type_id', 'payer_code', 'change', 'glimepiride', 'pioglitazone', 'number_diagnoses', 'number_emergency', 'glipizide', 'number_inpatient', 'A1Cresult', 'medical_specialty', 'readmitted', 'age', 'metformin', 'num_medications', 'diag_3'] Unused Features (Present in Dataset but NOT Selected by LASSO or RF): ['chlorpropamide', 'acetohexamide', 'tolbutamide', 'miglitol', 'troglitazone', 'tolazamide', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']
Create a Reduced Dataset with Important Variables¶
# Train/test split was already created. Do this to avoid leakage.
# Identify encoded features corresponding to lasso-selected original features
selected_encoded_features = [col for col in X_train.columns if any(feature in col for feature in lasso_unique_features)]
# Create a Reduced Dataset with Encoded Features
X_train_selected = X_train[selected_encoded_features].copy()
X_test_selected = X_test[selected_encoded_features].copy() # Ensure same columns in test set
# Ensure train and test have the same columns (Test set may lack some categories)
X_test_selected = X_test_selected.reindex(columns=X_train_selected.columns, fill_value=0)
# Final check
print("Final Training Shape:", X_train_selected.shape)
print("Final Testing Shape:", X_test_selected.shape)
Final Training Shape: (81412, 2362) Final Testing Shape: (20354, 2362)
- Feature selection was performed using both LASSO penalized regression and Random Forest Regression to identify the most important variables.
- The selected features were highly similar between both methods, indicating consistency in variable importance.
- The LASSO-selected feature set was chosen as the final set because it resulted in a more reduced set of features, helping simplify the model while retaining predictive strength.
Modeling and Evaluation: Regression Task¶
Define Regressors¶
We tested five regression models:
- LASSO Regression (LassoCV)
- Selected for its ability to perform feature selection by penalizing less important variables, leading to a simpler, more interpretable model.
- Random Forest Regressor
- Chosen for its ability to capture complexity and for being less sensitive to outliers.
RandomizedSearchCVwas used for hyperparameter tuning to improve efficiency.
- Support Vector Regression (SVR)
- Included only during hyperparameter tuning to test how a kernel-based approach compared to the linear models.
- Subsampling was used to improve computational efficiency.
- Stochastic Gradient Descent Regressor (SGDRegressor)
- Chosen as a alternative to standard linear regression for use with a medium to large dataset.
- XGBoost Regressor
- Included for its high flexibility and efficiency.
RandomizedSearchCVwas used for hyperparameter tuning to improve efficiency.
Tune Hyperparameters¶
Mean squared error (MSE) (or negative MSE) was used as the tuning metric.
For each model, hyperparameters were tuned to improve performance. Key parameters included:
- LASSO Regression (LassoCV)
alpha: Regularization strength (controls the penalty on large coefficients; smaller values allow more flexibility but increase overfitting risk, while larger values result in stronger regularization, reducing complexity).
- Random Forest Regressor
n_estimators: Number of trees in the forest (higher values improve stability but increase training time).max_depth: Limits tree depth to prevent overfitting (deeper trees capture more patterns but risk capturing noise).min_samples_split: Minimum samples required to split a node (higher values encourage simpler trees, reducing overfitting).min_samples_leaf: Minimum samples required per leaf node (larger values prevent overfitting).max_features: Controls the number of features considered per split ('sqrt'balances model diversity and accuracy).
- Support Vector Regression (SVR)
C: Regularization parameter (higher values allow more flexibility but risk overfitting, while lower values enforce stricter margins).gamma: Controls the amount of influence of each data point with higher resulting in greater influence and complexity (scaleuses feature variance,autouses 1/#features).epsilon: Defines a margin of tolerance for small prediction errors (higher values simplify the model but reduce sensitivity).- RBF kernel was selected to allow for non-linear regression modeling.
- Stochastic Gradient Descent Regressor (SGDRegressor)
alpha: Regularization strength (higher values reduce complexity but risk underfitting, while lower values can overfit).max_iter: Maximum number of iterations for convergence.tol: Stopping criteria (smaller values force additional iterations but increasing computation, larger values result in faster fitting but may stop before best solution is reached).learning_rate: Controls step size per iteration (constantis fixed,optimaladjusts dynamically,invscalingreduces learning rate over time).- Early stopping was enabled to prevent unnecessary training once performance stabilizes because max iterations was set high to ensure convergence.
- XGBoost Regressor
n_estimators: Number of boosting rounds (higher values improve performance but risk overfitting).max_depth: Limits the depth of trees (lower values prevent overfitting, higher values may capture complex relationships).learning_rate: Controls the contribution of each weak learner (lower values require more boosting rounds, higher values can lead to overfitting).subsample: Proportion of data used per boosting round (lower values reduce overfitting by adding randomness).
Hyperparameter Tuning¶
# ---- Hyperparameter Tuning for LASSO ----
start_time = time.time()
lasso_pipeline = Pipeline([
('preprocessor', preprocessor),
('reg', LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100], cv=5, random_state=1234)) # Use LassoCV instead of GridSearch
])
lasso_pipeline.fit(X_train_selected, y_train)
# Get best alpha directly from LassoCV
best_alpha = lasso_pipeline.named_steps['reg'].alpha_
print(f"\nBest Alpha for LASSO: {best_alpha}")
print(f"LASSO tuning time: {(time.time() - start_time)/60:.2f} minutes")
# ---- Hyperparameter Tuning for Random Forest ----
start_time = time.time()
rf_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [10, 20, None],
'min_samples_split': [5, 10],
'min_samples_leaf': [2, 5],
'max_features': ['sqrt']
}
# Use RandomizedSearchCV for efficiency
grid_rf = RandomizedSearchCV(RandomForestRegressor(random_state=1234),
param_distributions=rf_grid, n_iter=7, cv=5,
scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train_selected, y_train)
# Get best RF parameters
best_rf_params = grid_rf.best_params_
print(f"\nBest Parameters for Random Forest: {best_rf_params}")
print(f"RF tuning time: {(time.time() - start_time)/60:.2f} minutes")
# ---- Hyperparameter Tuning for SVR (RBF) ----
# For comparison with SGDRegressor (linear), RBF is computationally too expensive with full dataset
start_time = time.time()
svr_sample = X_train_selected.sample(n=min(5000, len(X_train_selected)), random_state=1234)
y_svr_sample = y_train.loc[svr_sample.index]
param_grid_svr = {
'reg__C': [0.1, 1, 10],
'reg__gamma': ['scale', 'auto'],
'reg__epsilon': [0.1, 0.2, 0.5]
}
svr_pipeline = Pipeline([('preprocessor', preprocessor), ('reg', SVR(kernel='rbf'))])
grid_svr = GridSearchCV(svr_pipeline, param_grid_svr, cv=5, scoring='neg_mean_squared_error', n_jobs=2)
grid_svr.fit(svr_sample, y_svr_sample)
# Extract best parameters
best_C = grid_svr.best_params_['reg__C']
best_epsilon = grid_svr.best_params_['reg__epsilon']
print(f"\nBest SVR Parameters: C={best_C}, epsilon={best_epsilon}, Kernel=rbf")
print(f"SVR tuning time: {(time.time() - start_time)/60:.2f} minutes")
# ---- Hyperparameter Tuning for SGDRegressor----
start_time = time.time()
param_grid_sgd = {
'reg__alpha': [0.0001, 0.001, 0.01, 0.1],
'reg__max_iter': [10000, 20000],
'reg__tol': [1e-2, 1e-3],
'reg__learning_rate': ['constant', 'optimal', 'invscaling']
}
sgd_pipeline = Pipeline([('preprocessor', preprocessor), ('reg', SGDRegressor(random_state=1234, early_stopping=True))])
grid_sgd = GridSearchCV(sgd_pipeline, param_grid_sgd, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_sgd.fit(svr_sample, y_svr_sample) # Use the same sample of 5000 as used for tuning the SVR
best_sgd_params = grid_sgd.best_params_
print(f"\nBest Parameters for SGDRegressor: {best_sgd_params}")
print(f"SGDRegressor tuning time: {(time.time() - start_time)/60:.2f} minutes")
# ---- Hyperparameter Tuning for XGBoost ----
# Preprocessing fixes
# Ensure column names are strings
X_train_selected.columns = X_train_selected.columns.astype(str).str.replace(r"[^\w]", "_", regex=True)
X_test_selected.columns = X_test_selected.columns.astype(str).str.replace(r"[^\w]", "_", regex=True)
# # Ensure y_train is a 1D array
# y_train = y_train.ravel()
start_time = time.time()
xgb_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 10],
'learning_rate': [0.01, 0.1, 0.2],
'subsample': [0.7, 1.0]
}
grid_xgb = RandomizedSearchCV(XGBRegressor(tree_method='hist', random_state=1234),
param_distributions=xgb_grid, n_iter=7, cv=5,
scoring='neg_mean_squared_error')
grid_xgb.fit(X_train_selected, y_train)
# Get best XGBoost parameters
best_xgb_params = grid_xgb.best_params_
print(f"\nBest Parameters for XGBoost: {best_xgb_params}")
print(f"XGBoost tuning time: {(time.time() - start_time)/60:.2f} minutes")
Best Alpha for LASSO: 0.0001
LASSO tuning time: 0.97 minutes
Best Parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}
RF tuning time: 1.53 minutes
Best SVR Parameters: C=10, epsilon=0.5, Kernel=rbf
SVR tuning time: 5.26 minutes
Best Parameters for SGDRegressor: {'reg__alpha': 0.001, 'reg__learning_rate': 'invscaling', 'reg__max_iter': 10000, 'reg__tol': 0.001}
SGDRegressor tuning time: 0.12 minutes
Best Parameters for XGBoost: {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.1}
XGBoost tuning time: 2.64 minutes
Hyperparameter Tuning Details¶
# ---- Display LASSO Tuning Results ----
lasso_cv = lasso_pipeline.named_steps['reg'] # Extract the LassoCV model
lasso_results = pd.DataFrame({
'Alpha': lasso_cv.alphas_,
'Mean MSE': lasso_cv.mse_path_.mean(axis=1),
'Std MSE': lasso_cv.mse_path_.std(axis=1)
})
lasso_results.sort_values(by='Mean MSE', ascending=True, inplace=True) # Lower MSE is better
print("\nLASSO Hyperparameter Tuning Results:")
print(lasso_results)
# ---- Display Random Forest Tuning Results ----
rf_results = pd.DataFrame(grid_rf.cv_results_)
rf_results = rf_results[[
'param_n_estimators', 'param_max_depth',
'param_min_samples_split', 'param_min_samples_leaf',
'param_max_features', 'mean_test_score', 'std_test_score'
]]
rf_results.rename(columns={'mean_test_score': 'Mean MSE', 'std_test_score': 'Std MSE'}, inplace=True)
rf_results.sort_values(by='Mean MSE', ascending=False, inplace=True) # Higher neg MSE is better
print("\nRandom Forest Hyperparameter Tuning Results:")
print(rf_results)
# ---- Display SVR Tuning Results ----
svr_results = pd.DataFrame(grid_svr.cv_results_)
svr_results = svr_results[[
'param_reg__C', 'param_reg__gamma', 'param_reg__epsilon',
'mean_test_score', 'std_test_score'
]]
svr_results.rename(columns={'mean_test_score': 'Mean MSE', 'std_test_score': 'Std MSE'}, inplace=True)
svr_results.sort_values(by='Mean MSE', ascending=False, inplace=True)
print("\nSVR Hyperparameter Tuning Results:")
print(svr_results)
# ---- Display SGDRegressor Tuning Results ----
sgd_results = pd.DataFrame(grid_sgd.cv_results_)
sgd_results = sgd_results[[
'param_reg__alpha', 'param_reg__max_iter', 'param_reg__tol', 'param_reg__learning_rate',
'mean_test_score', 'std_test_score'
]]
sgd_results.rename(columns={'mean_test_score': 'Mean MSE', 'std_test_score': 'Std MSE'}, inplace=True)
sgd_results.sort_values(by='Mean MSE', ascending=False, inplace=True)
print("\nSGDRegressor Hyperparameter Tuning Results:")
print(sgd_results)
# ---- Display XGBoost Tuning Results ----
xgb_results = pd.DataFrame(grid_xgb.cv_results_)
xgb_results = xgb_results[[
'param_n_estimators', 'param_max_depth', 'param_learning_rate', 'param_subsample',
'mean_test_score', 'std_test_score'
]]
xgb_results.rename(columns={'mean_test_score': 'Mean MSE', 'std_test_score': 'Std MSE'}, inplace=True)
xgb_results.sort_values(by='Mean MSE', ascending=False, inplace=True)
print("\nXGBoost Hyperparameter Tuning Results:")
print(xgb_results)
LASSO Hyperparameter Tuning Results:
Alpha Mean MSE Std MSE
6 0.0001 4.922157 0.061825
5 0.0010 5.006710 0.065134
4 0.0100 5.466042 0.073354
3 0.1000 6.520046 0.097922
2 1.0000 7.997807 0.105850
0 100.0000 8.937581 0.105310
1 10.0000 8.937581 0.105310
Random Forest Hyperparameter Tuning Results:
param_n_estimators param_max_depth param_min_samples_split \
6 100 None 5
2 100 None 5
3 50 20 10
4 200 20 10
1 50 20 10
5 50 10 10
0 100 10 10
param_min_samples_leaf param_max_features Mean MSE Std MSE
6 2 sqrt -5.586330 0.096752
2 5 sqrt -5.931775 0.117458
3 2 sqrt -6.523843 0.149670
4 2 sqrt -6.540768 0.113634
1 5 sqrt -6.601560 0.130061
5 2 sqrt -7.436161 0.184347
0 2 sqrt -7.469187 0.167242
SVR Hyperparameter Tuning Results:
param_reg__C param_reg__gamma param_reg__epsilon Mean MSE Std MSE
16 10.0 scale 0.5 -5.180576 0.306000
14 10.0 scale 0.2 -5.209867 0.309865
12 10.0 scale 0.1 -5.225968 0.311395
10 1.0 scale 0.5 -5.373068 0.317040
8 1.0 scale 0.2 -5.391152 0.317922
6 1.0 scale 0.1 -5.402822 0.318288
17 10.0 auto 0.5 -6.056064 0.419162
15 10.0 auto 0.2 -6.071371 0.421181
13 10.0 auto 0.1 -6.090874 0.423207
4 0.1 scale 0.5 -6.469129 0.384134
2 0.1 scale 0.2 -6.479029 0.404270
0 0.1 scale 0.1 -6.484694 0.404506
9 1.0 auto 0.2 -7.145159 0.476469
7 1.0 auto 0.1 -7.149003 0.469673
11 1.0 auto 0.5 -7.166944 0.459185
1 0.1 auto 0.1 -8.941278 0.538385
3 0.1 auto 0.2 -9.041736 0.546378
5 0.1 auto 0.5 -9.252338 0.529826
SGDRegressor Hyperparameter Tuning Results:
param_reg__alpha param_reg__max_iter param_reg__tol \
21 0.0010 10000 0.001
23 0.0010 20000 0.001
11 0.0001 20000 0.001
9 0.0001 10000 0.001
8 0.0001 10000 0.010
10 0.0001 20000 0.010
20 0.0010 10000 0.010
22 0.0010 20000 0.010
33 0.0100 10000 0.001
35 0.0100 20000 0.001
34 0.0100 20000 0.010
32 0.0100 10000 0.010
12 0.0010 10000 0.010
14 0.0010 20000 0.010
44 0.1000 10000 0.010
45 0.1000 10000 0.001
46 0.1000 20000 0.010
47 0.1000 20000 0.001
13 0.0010 10000 0.001
15 0.0010 20000 0.001
27 0.0100 20000 0.001
26 0.0100 20000 0.010
25 0.0100 10000 0.001
24 0.0100 10000 0.010
1 0.0001 10000 0.001
3 0.0001 20000 0.001
2 0.0001 20000 0.010
0 0.0001 10000 0.010
38 0.1000 20000 0.010
37 0.1000 10000 0.001
39 0.1000 20000 0.001
36 0.1000 10000 0.010
40 0.1000 10000 0.010
41 0.1000 10000 0.001
42 0.1000 20000 0.010
43 0.1000 20000 0.001
31 0.0100 20000 0.001
29 0.0100 10000 0.001
30 0.0100 20000 0.010
28 0.0100 10000 0.010
16 0.0010 10000 0.010
19 0.0010 20000 0.001
17 0.0010 10000 0.001
18 0.0010 20000 0.010
7 0.0001 20000 0.001
6 0.0001 20000 0.010
5 0.0001 10000 0.001
4 0.0001 10000 0.010
param_reg__learning_rate Mean MSE Std MSE
21 invscaling -5.352532e+00 3.425110e-01
23 invscaling -5.352532e+00 3.425110e-01
11 invscaling -5.362638e+00 3.275472e-01
9 invscaling -5.362638e+00 3.275472e-01
8 invscaling -5.512757e+00 3.668450e-01
10 invscaling -5.512757e+00 3.668450e-01
20 invscaling -5.526833e+00 3.858567e-01
22 invscaling -5.526833e+00 3.858567e-01
33 invscaling -5.581544e+00 3.906860e-01
35 invscaling -5.581544e+00 3.906860e-01
34 invscaling -5.595339e+00 3.842099e-01
32 invscaling -5.595339e+00 3.842099e-01
12 constant -5.839271e+00 4.588282e-01
14 constant -5.839271e+00 4.588282e-01
44 invscaling -5.889720e+00 3.318518e-01
45 invscaling -5.889720e+00 3.318518e-01
46 invscaling -5.889720e+00 3.318518e-01
47 invscaling -5.889720e+00 3.318518e-01
13 constant -5.943402e+00 3.258488e-01
15 constant -5.943402e+00 3.258488e-01
27 constant -5.968213e+00 6.241830e-01
26 constant -5.968213e+00 6.241830e-01
25 constant -5.968213e+00 6.241830e-01
24 constant -5.968213e+00 6.241830e-01
1 constant -5.977302e+00 3.786378e-01
3 constant -5.977302e+00 3.786378e-01
2 constant -6.015074e+00 4.258265e-01
0 constant -6.015074e+00 4.258265e-01
38 constant -6.994141e+00 1.581532e+00
37 constant -6.994141e+00 1.581532e+00
39 constant -6.994141e+00 1.581532e+00
36 constant -6.994141e+00 1.581532e+00
40 optimal -1.078535e+17 1.360745e+17
41 optimal -1.078535e+17 1.360745e+17
42 optimal -1.078535e+17 1.360745e+17
43 optimal -1.078535e+17 1.360745e+17
31 optimal -7.763961e+18 4.336271e+18
29 optimal -7.763961e+18 4.336271e+18
30 optimal -7.763961e+18 4.336271e+18
28 optimal -7.763961e+18 4.336271e+18
16 optimal -1.282457e+20 1.648745e+20
19 optimal -1.282457e+20 1.648745e+20
17 optimal -1.282457e+20 1.648745e+20
18 optimal -1.282457e+20 1.648745e+20
7 optimal -2.021550e+25 2.673950e+25
6 optimal -2.021550e+25 2.673950e+25
5 optimal -2.021550e+25 2.673950e+25
4 optimal -2.021550e+25 2.673950e+25
XGBoost Hyperparameter Tuning Results:
param_n_estimators param_max_depth param_learning_rate param_subsample \
2 100 10 0.10 0.7
1 100 10 0.20 1.0
4 200 3 0.20 1.0
6 50 5 0.20 1.0
3 200 5 0.01 0.7
0 200 3 0.01 0.7
5 100 5 0.01 1.0
Mean MSE Std MSE
2 -4.728530 0.058843
1 -4.731454 0.066470
4 -4.819531 0.070729
6 -4.955254 0.073430
3 -5.723944 0.092798
0 -6.129246 0.096382
5 -6.390031 0.097441
Cross-Validate Regression Models¶
# Clean best parameters to remove the "reg__" prefix
best_sgd_params_cleaned = {k.replace("reg__", ""): v for k, v in best_sgd_params.items()}
# Define best-tuned models with corrected parameters
models = {
"LASSO Regression": Lasso(alpha=best_alpha, random_state=1234),
"Random Forest Regression": RandomForestRegressor(**best_rf_params, random_state=1234),
"SGD Regression": SGDRegressor(**best_sgd_params_cleaned, random_state=1234),
"XGBoost": XGBRegressor(**best_xgb_params, tree_method='hist', random_state=1234)
}
# Define 10-fold cross-validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
# Perform CV for each model and store MSE scores
cv_mse_scores = {}
for name, model in models.items():
mse_scores = -cross_val_score(model, X_train_selected, y_train, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
cv_mse_scores[name] = mse_scores
print(f"\n{name} 10-Fold CV MSE Scores: {mse_scores}")
LASSO Regression 10-Fold CV MSE Scores: [4.8755231 4.90532026 4.95241657 5.02746912 5.00917965 4.81934181 5.11395915 4.68981783 4.88224079 4.81671992] Random Forest Regression 10-Fold CV MSE Scores: [5.50320506 5.59091167 5.5530301 5.72141489 5.57060659 5.42561055 5.64177334 5.39403829 5.54358504 5.51895066] SGD Regression 10-Fold CV MSE Scores: [ 5.83459237 10.60795108 5.64682685 6.5272637 5.57556209 7.6926736 6.05899605 5.56842417 5.27124945 5.39333436] XGBoost 10-Fold CV MSE Scores: [4.68114614 4.74809742 4.71482468 4.81430912 4.78221083 4.59746981 4.86265707 4.57806444 4.70219517 4.63741684]
Fit and Evaluate Regression Models on the Holdout Set¶
start_time_total = time.time() # Track total time
# Define model labels
reg_labels = ['LASSO Regression', 'Random Forest Regression', 'SGD Regression', 'XGBoost']
# Store training and prediction times
model_times = {"Training Time (min)": {}, "Prediction Time (sec)": {}}
# ---- Update pipelines with with best parameters ----
# Update Lasso pipeline
lasso_pipeline = Pipeline([
('preprocessor', preprocessor),
('reg', Lasso(alpha=best_alpha, random_state=1234)) # Use Lasso (not LassoCV)
])
# Update RF model with modified best parameters, set n_estimators manually (error/time tradeoff isn't worth it)
# Remove "reg__" prefix from best parameters
best_rf_params_cleaned = {k.replace("reg__", ""): v for k, v in best_rf_params.items()}
best_rf_params_cleaned['n_estimators'] = 50 # Manually reduce to 50 trees
rf_model = RandomForestRegressor(**best_rf_params_cleaned, random_state=1234) # No pipeline needed
# Update SGDRegressor pipeline
best_sgd_params = {k.replace("reg__", ""): v for k, v in grid_sgd.best_params_.items()}
best_sgd_params['random_state'] = 1234 # Ensure reproducibility
sgd_pipeline = Pipeline([
('preprocessor', preprocessor),
('reg', SGDRegressor(**best_sgd_params))
])
# Update XGB model
xgb_best_params = {k.replace("reg__", ""): v for k, v in best_xgb_params.items()}
xgb_model = XGBRegressor(**xgb_best_params, tree_method='hist', random_state=1234)
# ---- Preprocessing Fixes for XGBoost ----
X_train_selected.columns = X_train_selected.columns.astype(str).str.replace(r"[^\w]", "_", regex=True)
X_test_selected.columns = X_test_selected.columns.astype(str).str.replace(r"[^\w]", "_", regex=True)
# y_train = y_train.ravel() # Ensure y_train is a 1D array
# ---- Fit the tuned models ----
start_time = time.time()
lasso_pipeline.fit(X_train_selected, y_train)
model_times["Training Time (min)"]["LASSO Regression"] = (time.time() - start_time) / 60 # Convert to minutes
start_time = time.time()
rf_model.fit(X_train_selected, y_train)
model_times["Training Time (min)"]["Random Forest Regression"] = (time.time() - start_time) / 60
start_time = time.time()
sgd_pipeline.fit(X_train_selected, y_train)
model_times["Training Time (min)"]["SGD Regression"] = (time.time() - start_time) / 60
start_time = time.time()
xgb_model.fit(X_train_selected, y_train)
model_times["Training Time (min)"]["XGBoost"] = (time.time() - start_time) / 60
# ---- Make predictions ----
start_time = time.time()
y_test_pred_lasso = lasso_pipeline.predict(X_test_selected)
model_times["Prediction Time (sec)"]["LASSO Regression"] = time.time() - start_time
start_time = time.time()
y_test_pred_rf = rf_model.predict(X_test_selected)
model_times["Prediction Time (sec)"]["Random Forest Regression"] = time.time() - start_time
start_time = time.time()
y_test_pred_sgd = sgd_pipeline.predict(X_test_selected)
model_times["Prediction Time (sec)"]["SGD Regression"] = time.time() - start_time
start_time = time.time()
y_test_pred_xgb = xgb_model.predict(X_test_selected)
model_times["Prediction Time (sec)"]["XGBoost"] = time.time() - start_time
# Function for evaluating models
def evaluate_model(y_true, y_pred, model_name):
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print(f"\nPerformance Metrics for {model_name}:")
print(f" Mean Squared Error (MSE): {mse:.4f}")
print(f" Mean Absolute Error (MAE): {mae:.4f}")
print(f" R² Score: {r2:.4f}")
return {"Model": model_name, "MSE": mse, "MAE": mae, "R²": r2}
# Store results
results = []
for y_pred, label in zip([y_test_pred_lasso, y_test_pred_rf, y_test_pred_sgd, y_test_pred_xgb], reg_labels):
results.append(evaluate_model(y_test, y_pred, label))
# Convert to DataFrame for easy comparison
results_df = pd.DataFrame(results)
print("\nFinal Model Comparison:")
print(results_df)
# Print training, prediction and total times
print("\nModel Training Times (in minutes):")
for model, time_taken in model_times["Training Time (min)"].items():
print(f"{model}: {time_taken:.2f} minutes")
print("\nModel Prediction Times (in seconds):")
for model, time_taken in model_times["Prediction Time (sec)"].items():
print(f"{model}: {time_taken:.2f} seconds")
end_time_total = time.time()
print(f"\nTotal Elapsed Time: {(end_time_total - start_time_total)/60:.2f} minutes")
Performance Metrics for LASSO Regression:
Mean Squared Error (MSE): 4.8428
Mean Absolute Error (MAE): 1.6569
R² Score: 0.4500
Performance Metrics for Random Forest Regression:
Mean Squared Error (MSE): 5.4340
Mean Absolute Error (MAE): 1.7723
R² Score: 0.3828
Performance Metrics for SGD Regression:
Mean Squared Error (MSE): 4.9112
Mean Absolute Error (MAE): 1.6579
R² Score: 0.4422
Performance Metrics for XGBoost:
Mean Squared Error (MSE): 4.6518
Mean Absolute Error (MAE): 1.5990
R² Score: 0.4717
Final Model Comparison:
Model MSE MAE R²
0 LASSO Regression 4.842779 1.656895 0.449960
1 Random Forest Regression 5.434037 1.772341 0.382806
2 SGD Regression 4.911203 1.657874 0.442189
3 XGBoost 4.651790 1.598970 0.471653
Model Training Times (in minutes):
LASSO Regression: 0.71 minutes
Random Forest Regression: 0.24 minutes
SGD Regression: 0.16 minutes
XGBoost: 0.07 minutes
Model Prediction Times (in seconds):
LASSO Regression: 0.07 seconds
Random Forest Regression: 0.23 seconds
SGD Regression: 0.07 seconds
XGBoost: 0.25 seconds
Total Elapsed Time: 1.19 minutes
LASSO Regression and XGBoost had the lowest MSE, with LASSO achieving the best overall performance (MSE: 4.84).
Random Forest had the highest MSE (5.43) suggesting it struggled to generalize compared to other models.
SGD Regression performed similarly to LASSO (MSE: 4.91) but was slightly less accurate.
XGBoost had comparable performance to LASSO (MSE: 4.87) but took longer to make predictions.
Random Forest was the slowest model overall with the weakest performance.
Recommendation:
- LASSO Regression is the best choice due to its balance of accuracy, efficiency, and interpretability.
- SGD Regression and XGBoost are good alternatives.
Plot the Residuals¶
# Define regression models & labels
regressors = [lasso_pipeline, rf_model, sgd_pipeline, xgb_model]
reg_labels = ['LASSO Regression', 'Random Forest Regression', 'SGD Regression', 'XGBoost']
# Create subplots (3 rows: Residuals, Predicted vs Actual, Error Distribution)
fig, axes = plt.subplots(3, len(regressors), figsize=(len(regressors) * 5, 15))
for i, (model, label) in enumerate(zip(regressors, reg_labels)):
# Predictions
y_test_pred = model.predict(X_test_selected)
residuals = y_test - y_test_pred # Compute residuals (errors)
# ---- Residual Plot ----
axes[0, i].scatter(y_test_pred, residuals, alpha=0.5)
axes[0, i].axhline(y=0, color="r", linestyle="--") # Reference line at 0
axes[0, i].set_title(f"Residual Plot - {label}")
axes[0, i].set_xlabel("Predicted Values")
axes[0, i].set_ylabel("Residuals")
# ---- Predicted vs. Actual ----
axes[1, i].scatter(y_test, y_test_pred, alpha=0.5)
axes[1, i].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--') # Perfect prediction line
axes[1, i].set_title(f"Predicted vs. Actual - {label}")
axes[1, i].set_xlabel("Actual Values")
axes[1, i].set_ylabel("Predicted Values")
# ---- Error Distribution ----
sns.histplot(residuals, bins=30, kde=True, ax=axes[2, i])
axes[2, i].set_title(f"Error Distribution - {label}")
axes[2, i].set_xlabel("Residuals")
plt.tight_layout()
plt.show()
- The residual plots for all models show a pattern rather than random scatter, suggesting that the models are not fully capturing the underlying structure in the data.
- The error distributions are approximately normal.
Statistical Comparison of Models from the Residuals¶
from scipy.stats import ttest_rel, wilcoxon
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import numpy as np
# Compute residuals for each model
lasso_residuals = np.abs(y_test - y_test_pred_lasso)
rf_residuals = np.abs(y_test - y_test_pred_rf)
sgd_residuals = np.abs(y_test - y_test_pred_sgd)
xgb_residuals = np.abs(y_test - y_test_pred_xgb)
# Store in dictionary
residuals_dict = {
"LASSO Regression": lasso_residuals,
"Random Forest Regression": rf_residuals,
"SGD Regression": sgd_residuals,
"XGBoost": xgb_residuals
}
# Combine all residuals into a single array
all_residuals = np.concatenate([residuals_dict[name] for name in model_names])
# Create labels for each model (repeated for each test sample)
model_labels = []
for name in model_names:
model_labels.extend([name] * len(residuals_dict[name]))
# Perform Tukey's HSD test
tukey_results = pairwise_tukeyhsd(all_residuals, model_labels, alpha=0.05)
# Print results
print("\nTukey's HSD Test Results (using holdout residuals):")
print(tukey_results)
Tukey's HSD Test Results (using holdout residuals):
Multiple Comparison of Means - Tukey HSD, FWER=0.05
========================================================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------------------------------------------
LASSO Regression Random Forest Regression 0.1154 0.0 0.078 0.1529 True
LASSO Regression SGD Regression 0.001 0.9999 -0.0365 0.0384 False
LASSO Regression XGBoost -0.0579 0.0004 -0.0954 -0.0205 True
Random Forest Regression SGD Regression -0.1145 0.0 -0.1519 -0.077 True
Random Forest Regression XGBoost -0.1734 0.0 -0.2108 -0.1359 True
SGD Regression XGBoost -0.0589 0.0003 -0.0964 -0.0215 True
----------------------------------------------------------------------------------------
- Tukey’s HSD test found significant differences in residuals across most model pairs.
- The only non-significant difference was between LASSO and SGD Regression, suggesting they perform similarly in terms of error distribution.
Plot Learning Curves¶
from sklearn.model_selection import learning_curve
# Apply fixes: Increase iterations, ensure scaling, and adjust alpha
lasso_pipeline.set_params(reg__max_iter=5000, reg__alpha=0.01)
# Function to plot learning curves
def plot_learning_curve(model, X, y, title):
train_sizes, train_scores, test_scores = learning_curve(
model, X, y, cv=5, scoring="neg_mean_squared_error",
train_sizes=np.linspace(0.1, 1.0, 10))
# Convert scores to positive MSE
train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', label="Training Error")
plt.plot(train_sizes, test_scores_mean, 'o-', label="Validation Error")
plt.xlabel("Training Samples")
plt.ylabel("Mean Squared Error")
plt.title(title)
plt.legend(loc="best")
# Plot for all models
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
models = {
"LASSO Regression": lasso_pipeline,
"Random Forest": rf_model,
"SGD Regression": sgd_pipeline,
"XGBoost": xgb_model
}
for ax, (name, model) in zip(axes.ravel(), models.items()):
plot_learning_curve(model, X_train_selected, y_train, f"Learning Curve - {name}")
plt.sca(ax)
plt.tight_layout()
plt.show()
Each learning curve shows Mean Squared Error (MSE) on both the training set and validation set as the number of training samples increases.
- Random Forest:
- Training error remains consistently low, while validation error gradually decreases but does not converge.
- Large separation between training and validation curves suggest overfitting.
- SGD Regression:
- Training error starts low but increases as more data is used, converging towards validation error.
- Suggests high bias, meaning the model may be underfitting.
- XGBoost:
- Training error starts very low and increases slightly, while validation error decreases and stabilizes.
- Shows some evidence of overfitting but less than Random Forest.
- LASSO Regression:
- Training and validation errors are relatively close, with both stabilizing early.
- Suggests the model may be underfitting.
Interpreting the Lasso Model¶
# Extract the trained LASSO model from the pipeline
lasso_model = lasso_pipeline.named_steps['reg']
# Get feature names from the preprocessor (if applicable)
if 'preprocessor' in lasso_pipeline.named_steps:
feature_names = list(X_train_selected.columns) # If no preprocessing was applied
else:
feature_names = list(X_train_selected.columns) # Use raw feature names
# Get the coefficients
lasso_coefficients = pd.DataFrame({
'Feature': feature_names,
'Coefficient': lasso_model.coef_
})
# Remove zero coefficients
lasso_nonzero = lasso_coefficients[lasso_coefficients['Coefficient'] != 0]
# Count number of selected features
num_nonzero_features = lasso_nonzero.shape[0]
# Get top 20 important features (sorted by absolute coefficient value)
top_20_features = lasso_nonzero.assign(AbsCoefficient=lasso_nonzero['Coefficient'].abs()) \
.sort_values(by='AbsCoefficient', ascending=False) \
.drop(columns=['AbsCoefficient']) \
.head(20)
# Print results
print(f"\nTotal Number of Non-Zero Coefficients: {num_nonzero_features}")
print("\nTop 20 Most Important Features in LASSO:")
print(top_20_features)
Total Number of Non-Zero Coefficients: 924
Top 20 Most Important Features in LASSO:
Feature Coefficient
641 diag_1_731 4.803175
528 diag_1_583 3.958205
833 diag_1_V57 3.850634
126 medical_specialty_Pediatrics_Pulmonology 3.683990
286 diag_1_295 3.349534
287 diag_1_296 2.785223
186 diag_1_161 2.760550
304 diag_1_312 2.756563
221 diag_1_208 2.616360
830 diag_1_V54 2.425186
2297 diag_3_V53 2.379746
281 diag_1_290 2.319377
977 diag_2_312 2.298668
298 diag_1_307 2.281423
285 diag_1_294 2.136586
37 discharge_disposition_id_19 -2.118355
729 diag_1_862 2.034367
2310 diag_3_V72 -1.967630
273 diag_1_282 1.943745
498 diag_1_551 1.823169
Deployment¶
Medication Change Classification Model¶
Use Case:
- This model can help insurance companies or pharmacies predict which patients are likely to have a medication change.
- It can be used to allocate resources efficiently, reduce costs, and identify potential prescribing errors.
Measuring the Model’s Value:
- Monitor operational efficiency: Streamline resource allocation.
Deployment Strategy:
- Processing: Decide if predictions are batch processed (e.g. daily, weekly) or in real-time based on patient updates.
- Environment: What is the architecture of the client's environment and what integrations would be necessary?
Data Collection & Model Updates:
- Quarterly or Yearly Reassessment: Re-evaluate performance using new patient data.
- Feature Drift Analysis: Are some variables becoming less predictive over time?
- New Data Sources: Could lab results or patient adherence behavior improve the model?
Time in Hospital Regression Model¶
Use Case:
- Hospitals and insurance providers can use this model to predict patient length of stay, helping improve bed management, resource allocation, and cost forecasting.
Measuring the Model’s Value:
- Hospital Efficiency: Optimize bed availability and staffing.
- Cost Reduction: Identify high-risk patients earlier to reduce excessive hospital stays.
- Improved Patient Care: More accurate discharge planning leads to better patient outcomes.
Deployment Strategy:
- Insurance Claim Preprocessing: Use predictions for cost estimation.
- Hospital Resource Planning: Forecast future occupancy and adjust staffing levels
- Environment: What is the architecture of the client's environment and what integrations would be necessary?
Data Collection & Model Updates:
- Monitor Model Drift: Are patient demographics or treatments changing?
- New Data: Consider adding seasonal data to improve predictions.
- Regular Recalibration: Re-train the model annually using the latest hospital data.
Appendix: Free-Reign Modeling¶
df = df.copy()
Data Cleaning:¶
Initial data cleaning includes handling NAs, identifying high cardinality columns for frequency encoding, one-hot encoding categorical columns, and dropping any other columns thatr may not contribute to the analysis
# Replace '?' with NaN
df.replace('?', pd.NA, inplace=True)
# Check for columns with missing values
missing_summary = df.isnull().sum()
print(missing_summary[missing_summary > 0]) # Columns with missing data
# Drop columns that may not contribute to analysis
df.drop(['encounter_id', 'patient_nbr','weight','max_glu_serum','A1Cresult','examide','citoglipton'], axis=1, inplace=True)
high_cardinality_columns = [col for col in df.columns if df[col].nunique() > 9]
# Step 1: Create frequency maps for all columns
frequency_maps = {col: df[col].value_counts() for col in high_cardinality_columns}
# Step 2: Apply frequency encoding
for col in high_cardinality_columns:
df[f'{col}_encoded'] = df[col].map(frequency_maps[col])
# Apply encoding with fallback for unseen categories
for col in high_cardinality_columns:
df[f'{col}_encoded'] = df[col].map(frequency_maps[col]).fillna(0)
# Dropping old high cardinality columns
df.drop(high_cardinality_columns, axis=1, inplace=True)
# Identify all new columns that have 'encoded' in their names
high_cardinality_columns = [col for col in df.columns if 'encoded' in col ]
"""or 'change' in col"""
categorical_columns = df.select_dtypes(include='object').columns
categorical_columns = list(categorical_columns[0:23]) + list(categorical_columns[24:26])
print(categorical_columns)
#one-hot encoding to all categorical columns
df_encoded = pd.get_dummies(df.drop(columns=high_cardinality_columns + ['change']),
columns=categorical_columns,
drop_first=False)
df_encoded = pd.concat([df[high_cardinality_columns] , df[['change']], df_encoded], axis=1)
race 2273 weight 98569 payer_code 40256 medical_specialty 49949 diag_1 21 diag_2 358 diag_3 1423 max_glu_serum 96420 A1Cresult 84748 dtype: int64 ['race', 'gender', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'diabetesMed', 'readmitted']
Further cleaning involves factorizing all categorical variables and removing features with a value frequency over 80%¶
I chose to remove any column with one value that dominates the field by more than 80% to get rid of noise and boost model efficiency
## Remove duplicate columns
df_encoded = df_encoded.loc[:, ~df_encoded.T.duplicated()]
# Check for columns with missing values
missing_summary = df_encoded.isnull().sum()
print(missing_summary[missing_summary > 0]) # Columns with missing data
df_encoded = df_encoded.drop(columns=['payer_code_encoded'] + ['medical_specialty_encoded'])
# Factorize all categorical variables
# Select all boolean columns
categorical_columns = list(df_encoded.select_dtypes(include='bool').columns)
# Append 'readmitted' column to the list (if it exists in the DataFrame)
if 'change' in df_encoded.columns:
categorical_columns.append('change')
# Factorize
for col in categorical_columns:
df_encoded[col], mapping = pd.factorize(df_encoded[col])
# print(df_encoded.info())
# print(df_encoded.head())
print(df_encoded['change'])
columns_and_dtypes_df = df_encoded.dtypes.reset_index()
columns_and_dtypes_df.columns = ['Column', 'Data Type']
print(columns_and_dtypes_df)
"""
Left_out = df[['readmitted','Change']]
# Factorize
for col in Left_out:
df_encoded[col], mapping = pd.factorize(df_encoded[col])
"""
# Define threshold
threshold = 0.8
# Compute the most common value frequency per column
highly_constant_columns = [
col for col in df_encoded.columns
if df_encoded[col].value_counts(normalize=True).iloc[0] > threshold
]
# Drop these columns
df_filtered = df_encoded.drop(columns=highly_constant_columns)
# Print removed columns for reference
print("Dropped columns:", highly_constant_columns)
# Display the cleaned dataset
print(df_filtered.shape)
Series([], dtype: int64)
0 0
1 1
2 0
3 1
4 1
..
101761 1
101762 0
101763 1
101764 1
101765 0
Name: change, Length: 101766, dtype: int64
Column Data Type
0 age_encoded int64
1 discharge_disposition_id_encoded int64
2 admission_source_id_encoded int64
3 time_in_hospital_encoded int64
4 num_lab_procedures_encoded int64
.. ... ...
93 diabetesMed_No int64
94 diabetesMed_Yes int64
95 readmitted_<30 int64
96 readmitted_>30 int64
97 readmitted_NO int64
[98 rows x 2 columns]
Dropped columns: ['number_outpatient_encoded', 'number_emergency_encoded', 'race_AfricanAmerican', 'race_Asian', 'race_Hispanic', 'race_Other', 'gender_Unknown/Invalid', 'metformin_Down', 'metformin_No', 'metformin_Steady', 'metformin_Up', 'repaglinide_Down', 'repaglinide_No', 'repaglinide_Steady', 'repaglinide_Up', 'nateglinide_Down', 'nateglinide_No', 'nateglinide_Steady', 'nateglinide_Up', 'chlorpropamide_Down', 'chlorpropamide_No', 'chlorpropamide_Steady', 'chlorpropamide_Up', 'glimepiride_Down', 'glimepiride_No', 'glimepiride_Steady', 'glimepiride_Up', 'acetohexamide_No', 'acetohexamide_Steady', 'glipizide_Down', 'glipizide_No', 'glipizide_Steady', 'glipizide_Up', 'glyburide_Down', 'glyburide_No', 'glyburide_Steady', 'glyburide_Up', 'tolbutamide_No', 'tolbutamide_Steady', 'pioglitazone_Down', 'pioglitazone_No', 'pioglitazone_Steady', 'pioglitazone_Up', 'rosiglitazone_Down', 'rosiglitazone_No', 'rosiglitazone_Steady', 'rosiglitazone_Up', 'acarbose_Down', 'acarbose_No', 'acarbose_Steady', 'acarbose_Up', 'miglitol_Down', 'miglitol_No', 'miglitol_Steady', 'miglitol_Up', 'troglitazone_No', 'troglitazone_Steady', 'tolazamide_No', 'tolazamide_Steady', 'tolazamide_Up', 'insulin_Down', 'insulin_Up', 'glyburide-metformin_Down', 'glyburide-metformin_No', 'glyburide-metformin_Steady', 'glyburide-metformin_Up', 'glipizide-metformin_No', 'glipizide-metformin_Steady', 'glimepiride-pioglitazone_No', 'glimepiride-pioglitazone_Steady', 'metformin-rosiglitazone_No', 'metformin-rosiglitazone_Steady', 'metformin-pioglitazone_No', 'metformin-pioglitazone_Steady', 'readmitted_<30']
(101766, 23)
Next, we will use a random forest model to determine feature importance and further reduce the features of our data and define our reduced dataset to be used for model building¶
from sklearn.ensemble import RandomForestClassifier
# Quick feature importance analysis
X = df_filtered.drop(['change'], axis=1)
y = df_filtered['change']
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
# Feature importance
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False, inplace=True)
for feature,importance in feature_importance.items():
if importance >= .02:
print(f"{feature}: {importance}")
# Save feature names with importance >= 0.02 into a list
important_features = feature_importance[feature_importance >= 0.02].index.tolist()
# Now, `important_features` contains the feature names
print(important_features)
'''
sns.pairplot(df_encoded, vars= important_features, hue='readmitted')
plt.show()
'''
df_reduced = df_filtered[important_features].copy()
df_reduced.loc[:, 'change'] = df_filtered['change']
insulin_No: 0.16046023452653593 insulin_Steady: 0.10500799819552975 diabetesMed_Yes: 0.09611544597905328 diabetesMed_No: 0.09059137219899344 num_lab_procedures_encoded: 0.06511451195921641 num_medications_encoded: 0.060344793813028035 diag_2_encoded: 0.05974984589679944 diag_1_encoded: 0.05973121256117793 diag_3_encoded: 0.05809980873153814 time_in_hospital_encoded: 0.03701168689693641 age_encoded: 0.03474990210108076 number_diagnoses_encoded: 0.02618142970516673 num_procedures: 0.025441949470551433 discharge_disposition_id_encoded: 0.022949599315295326 ['insulin_No', 'insulin_Steady', 'diabetesMed_Yes', 'diabetesMed_No', 'num_lab_procedures_encoded', 'num_medications_encoded', 'diag_2_encoded', 'diag_1_encoded', 'diag_3_encoded', 'time_in_hospital_encoded', 'age_encoded', 'number_diagnoses_encoded', 'num_procedures', 'discharge_disposition_id_encoded']
# visualizations
import matplotlib.pyplot as plt
from IPython.display import Image
# pipelines
from sklearn.pipeline import _name_estimators
from sklearn.pipeline import Pipeline
# data preprocessing, cross-validation, accuracies
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
# individual classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# ensemble classifiers
##(not that we use a user defined class for MajorityVoting)
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
# others
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import clone
Defining Classes¶
class MajorityVoteClassifier(BaseEstimator,
ClassifierMixin):
""" A majority vote ensemble classifier
Parameters
----------
classifiers : array-like, shape = [n_classifiers]
Different classifiers for the ensemble
vote : str, {'classlabel', 'probability'} (default='classlabel')
If 'classlabel' the prediction is based on the argmax of
class labels. Else if 'probability', the argmax of
the sum of probabilities is used to predict the class label
(recommended for calibrated classifiers).
weights : array-like, shape = [n_classifiers], optional (default=None)
If a list of `int` or `float` values are provided, the classifiers
are weighted by importance; Uses uniform weights if `weights=None`.
"""
def __init__(self, classifiers, vote='classlabel', weights=None):
self.classifiers = classifiers
self.named_classifiers = {key: value for key, value
in _name_estimators(classifiers)}
self.vote = vote
self.weights = weights
def fit(self, X, y):
""" Fit classifiers.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_examples, n_features]
Matrix of training examples.
y : array-like, shape = [n_examples]
Vector of target class labels.
Returns
-------
self : object
"""
if self.vote not in ('probability', 'classlabel'):
raise ValueError("vote must be 'probability' or 'classlabel'"
"; got (vote=%r)"
% self.vote)
if self.weights and len(self.weights) != len(self.classifiers):
raise ValueError('Number of classifiers and weights must be equal'
'; got %d weights, %d classifiers'
% (len(self.weights), len(self.classifiers)))
# Use LabelEncoder to ensure class labels start with 0, which
# is important for np.argmax call in self.predict
self.lablenc_ = LabelEncoder()
self.lablenc_.fit(y)
self.classes_ = self.lablenc_.classes_
self.classifiers_ = []
for clf in self.classifiers:
fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
self.classifiers_.append(fitted_clf)
return self
def predict(self, X):
""" Predict class labels for X.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_examples, n_features]
Matrix of training examples.
Returns
----------
maj_vote : array-like, shape = [n_examples]
Predicted class labels.
"""
if self.vote == 'probability':
maj_vote = np.argmax(self.predict_proba(X), axis=1)
else: # 'classlabel' vote
# Collect results from clf.predict calls
predictions = np.asarray([clf.predict(X)
for clf in self.classifiers_]).T
maj_vote = np.apply_along_axis(
lambda x:
np.argmax(np.bincount(x,
weights=self.weights)),
axis=1,
arr=predictions)
maj_vote = self.lablenc_.inverse_transform(maj_vote)
return maj_vote
def predict_proba(self, X):
""" Predict class probabilities for X.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_examples, n_features]
Training vectors, where n_examples is the number of examples and
n_features is the number of features.
Returns
----------
avg_proba : array-like, shape = [n_examples, n_classes]
Weighted average probability for each class per example.
"""
probas = np.asarray([clf.predict_proba(X)
for clf in self.classifiers_])
avg_proba = np.average(probas, axis=0, weights=self.weights)
return avg_proba
def get_params(self, deep=True):
""" Get classifier parameter names for GridSearch"""
if not deep:
return super(MajorityVoteClassifier, self).get_params(deep=False)
else:
out = self.named_classifiers.copy()
for name, step in self.named_classifiers.items():
for key, value in step.get_params(deep=True).items():
out['%s__%s' % (name, key)] = value
return out
Splitting Data¶
df_reduced.fillna(0, inplace=True) # Replace NaNs with 0
# create X and y arrays
X = np.array(df_reduced.iloc[:, 1:])
y = np.array(df_reduced.iloc[:, 0])
# create samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=1, stratify=y)
Training Decision Tree and KNN classifiers¶
clf1 = DecisionTreeClassifier(max_depth=2,
criterion='entropy')
clf2 = KNeighborsClassifier(n_neighbors=5,
p=2,
metric='minkowski')
# the KNN classifier is not scale-invariant whereas the decission tree classifier is.
# remember it's a good habit to work with standardized features, so let's use a Pipeline (https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) to standardize the features for KNN
# the StandardScaller method in sklearn.preprocessing module does this for us
clf2_pipeline = Pipeline([['sc', StandardScaler()],
['clf', clf2]])
# define classifier labels
clf_names = ['Decision tree', 'KNN']
# evaluate the model performance for each classifier using 10-fold cross validation on the training data
# note that with the 10-fold validation we don't try to find the optimal combination of hyperparameter values (i.e., use the GridSearchCV() method from sklearn.model_selection module)
# instead, we want to fine-tune the performance given a single set of hyperparameter values
print('10-fold cross validation (CV):\n')
for clf, name in zip([clf1, clf2_pipeline], clf_names):
scores = cross_val_score(estimator=clf,
X=X_train,
y=y_train,
cv=10,
n_jobs=1) # n_jobs = the number of CPUs to use, set to -1 to use all
print("CV accuracy: %0.2f (+/- %0.2f) [%s]"
% (scores.mean(), scores.std(), name)) # cross_val_score() returns stats(e.g., mean and variance) for accuracy scores
10-fold cross validation (CV): CV accuracy: 0.91 (+/- 0.00) [Decision tree] CV accuracy: 0.91 (+/- 0.00) [KNN]
This shows the the accuracy of both classifiers is the same
Combining Classifiers for Majority Rule Voting and evaluating using 10-fold cv¶
mv_clf = MajorityVoteClassifier(classifiers=[clf1, clf2_pipeline])
clf_names += ['Majority voting']
all_clf = [clf1, clf2_pipeline, mv_clf]
for clf, label in zip(all_clf, clf_names):
scores = cross_val_score(estimator=clf,
X=X_train,
y=y_train,
cv=10,
n_jobs=1)
print("CV accuracy: %0.2f (+/- %0.2f) [%s]"
% (scores.mean(), scores.std(), label))
CV accuracy: 0.91 (+/- 0.00) [Decision tree] CV accuracy: 0.91 (+/- 0.00) [KNN] CV accuracy: 0.91 (+/- 0.00) [Majority voting]
The majority vote classifier didnt improve the accuracy over the KNN or decision tree
Ensemble Learning and Bagging¶
# ENSEMBLE LEARNING - BAGGING
tree = DecisionTreeClassifier(criterion='entropy',
max_depth=None)
bag = BaggingClassifier(tree,
n_estimators=500,
n_jobs=1)
# performance of the base tree alone
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
% (tree_train, tree_test))
# performance of bagging
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f'
% (bag_train, bag_test))
Decision tree train/test accuracies 1.000/0.888 Bagging train/test accuracies 1.000/0.921
Here, the unpruned decision tree predicst all rtraining class labels correctly, but the generalization does not perform as well as the past models. The bagging accuracy here excels though, most likely due to the multiple instances of the same model being trained.
ENSEMBLE LEARNING - ADABOOST¶
tree = DecisionTreeClassifier(criterion='entropy',
max_depth=1)
ada = AdaBoostClassifier(tree,
n_estimators=500)
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
% (tree_train, tree_test))
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f'
% (ada_train, ada_test))
Decision tree train/test accuracies 0.769/0.767 AdaBoost train/test accuracies 0.915/0.915
Here, the AdaBoost does imporove the test accuracy a little but not as much as the bagged decision tree model
Plotting Model Comparisons¶
# Define model names
models = ['Decision Tree', 'Bagging', 'AdaBoost']
# Define training and testing accuracies (Replace these values with actual results)
train_accuracies = [tree_train, bag_train, ada_train] # Example values
test_accuracies = [tree_test, bag_test, ada_test]
# Set bar positions
x = np.arange(len(models)) # Index positions for models
width = 0.35 # Bar width
# Create the figure and axis
fig, ax = plt.subplots(figsize=(8, 6))
# Plot bars for train and test accuracy
bars_train = ax.bar(x - width/2, train_accuracies, width, label='Train Accuracy', color='blue', alpha=0.7)
bars_test = ax.bar(x + width/2, test_accuracies, width, label='Test Accuracy', color='green', alpha=0.7)
# Add data labels on top of the bars
for bar in bars_train:
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{bar.get_height():.2f}',
ha='center', va='bottom', fontsize=10, fontweight='bold', color='black')
for bar in bars_test:
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{bar.get_height():.2f}',
ha='center', va='bottom', fontsize=10, fontweight='bold', color='black')
# Add labels and title
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Comparison of Model Accuracies with Data Labels')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()
<matplotlib.legend.Legend at 0x30b0310d0>
More Classification¶
raw_url = 'https://raw.githubusercontent.com/kdhenderson/ML7331_team_project/main/data/diabetes%2B130-us%2Bhospitals%2Bfor%2Byears%2B1999-2008/diabetic_data.csv'
df = pd.read_csv(raw_url)
# Replace `?` with NaN for now
df_clean = df.copy()
df_clean.replace('?', np.nan, inplace=True)
# Replace NaN ('?') with 'Unknown' in the specified columns
columns_to_update_1 = ['medical_specialty', 'payer_code', 'race']
df_clean[columns_to_update_1] = df_clean[columns_to_update_1].replace(np.nan, 'Unknown')
# Replace NaN ('?') with 'Unknown' in the specified columns
columns_to_update_2 = ['diag_1', 'diag_2', 'diag_3']
df_clean[columns_to_update_2] = df_clean[columns_to_update_2].replace(np.nan, 'Unknown/None')
# Replace NaN with 'Untested' in the specified columns
columns_to_update_3 = ['max_glu_serum', 'A1Cresult']
df_clean[columns_to_update_3] = df_clean[columns_to_update_3].replace(np.nan, 'Untested')
# Convert categorical variables `patient_nbr`, `admission_type_id`, `discharge_disposition_id`, `admission_source_id`
# from integer to object datatype.
categoricalInt_cols = ['patient_nbr', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']
df_clean[categoricalInt_cols] = df_clean[categoricalInt_cols].astype('category')
df_clean[['patient_nbr', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']].dtypes
# Remove encounter id, examide, citoglipton, weight, and patient_nbr from the dataset.
df_clean.drop(columns=['encounter_id', 'examide', 'citoglipton', 'weight', 'patient_nbr'], inplace=True)
# # Define the correct order for each variable
readmit_order = ['<30', '>30', 'NO']
drug_order = ['No', 'Down', 'Steady', 'Up']
max_glu_serum_order = ['Untested', 'Norm', '>200', '>300']
a1cresult_order = ['Untested', 'Norm', '>7', '>8']
age_order = ['[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
'[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)']
# # List of drug-related variables
drug_columns = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'tolazamide',
'pioglitazone', 'rosiglitazone', 'troglitazone', 'acarbose', 'miglitol',
'insulin', 'glyburide-metformin', 'glipizide-metformin',
'metformin-rosiglitazone', 'metformin-pioglitazone', 'glimepiride-pioglitazone']
# # Reorder categories in the DataFrame
df_clean['readmitted'] = pd.Categorical(df_clean['readmitted'], categories=readmit_order, ordered=True)
df_clean['max_glu_serum'] = pd.Categorical(df_clean['max_glu_serum'], categories=max_glu_serum_order, ordered=True)
df_clean['A1Cresult'] = pd.Categorical(df_clean['A1Cresult'], categories=a1cresult_order, ordered=True)
df_clean['age'] = pd.Categorical(df_clean['age'], categories=age_order, ordered=True)
for col in drug_columns:
if col in df_clean.columns:
df_clean[col] = pd.Categorical(df_clean[col], categories=drug_order, ordered=True)
# Preprocess diag_1, diag_2, diag_3 combining all codes with decimals under their integer values
for col in ['diag_1', 'diag_2', 'diag_3']:
df_clean[col] = df_clean[col].str.split('.').str[0] # Drop decimals and digits after
print( df_clean.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 101766 entries, 0 to 101765 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 race 101766 non-null object 1 gender 101766 non-null object 2 age 101766 non-null category 3 admission_type_id 101766 non-null category 4 discharge_disposition_id 101766 non-null category 5 admission_source_id 101766 non-null category 6 time_in_hospital 101766 non-null int64 7 payer_code 101766 non-null object 8 medical_specialty 101766 non-null object 9 num_lab_procedures 101766 non-null int64 10 num_procedures 101766 non-null int64 11 num_medications 101766 non-null int64 12 number_outpatient 101766 non-null int64 13 number_emergency 101766 non-null int64 14 number_inpatient 101766 non-null int64 15 diag_1 101766 non-null object 16 diag_2 101766 non-null object 17 diag_3 101766 non-null object 18 number_diagnoses 101766 non-null int64 19 max_glu_serum 101766 non-null category 20 A1Cresult 101766 non-null category 21 metformin 101766 non-null category 22 repaglinide 101766 non-null category 23 nateglinide 101766 non-null category 24 chlorpropamide 101766 non-null category 25 glimepiride 101766 non-null category 26 acetohexamide 101766 non-null category 27 glipizide 101766 non-null category 28 glyburide 101766 non-null category 29 tolbutamide 101766 non-null category 30 pioglitazone 101766 non-null category 31 rosiglitazone 101766 non-null category 32 acarbose 101766 non-null category 33 miglitol 101766 non-null category 34 troglitazone 101766 non-null category 35 tolazamide 101766 non-null category 36 insulin 101766 non-null category 37 glyburide-metformin 101766 non-null category 38 glipizide-metformin 101766 non-null category 39 glimepiride-pioglitazone 101766 non-null category 40 metformin-rosiglitazone 101766 non-null category 41 metformin-pioglitazone 101766 non-null category 42 change 101766 non-null object 43 diabetesMed 101766 non-null object 44 readmitted 101766 non-null category dtypes: category(28), int64(8), object(9) memory usage: 15.9+ MB None
Check Response Variable¶
print(df_clean['change'].describe())
count 101766 unique 2 top No freq 54755 Name: change, dtype: object
# Find the proportion of each response class.
print(df_clean['change'].value_counts()/df_clean['change'].count())
change No 0.538048 Ch 0.461952 Name: count, dtype: float64
numeric_df = df_clean.select_dtypes(include=[np.number])
numeric_df
| time_in_hospital | num_lab_procedures | num_procedures | num_medications | number_outpatient | number_emergency | number_inpatient | number_diagnoses | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 41 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1 | 3 | 59 | 0 | 18 | 0 | 0 | 0 | 9 |
| 2 | 2 | 11 | 5 | 13 | 2 | 0 | 1 | 6 |
| 3 | 2 | 44 | 1 | 16 | 0 | 0 | 0 | 7 |
| 4 | 1 | 51 | 0 | 8 | 0 | 0 | 0 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 101761 | 3 | 51 | 0 | 16 | 0 | 0 | 0 | 9 |
| 101762 | 5 | 33 | 3 | 18 | 0 | 0 | 1 | 9 |
| 101763 | 1 | 53 | 0 | 9 | 1 | 0 | 0 | 13 |
| 101764 | 10 | 45 | 2 | 21 | 0 | 0 | 1 | 9 |
| 101765 | 6 | 13 | 3 | 3 | 0 | 0 | 0 | 9 |
101766 rows × 8 columns
# Summary Statistics
description = numeric_df.describe()
mode = numeric_df.mode(numeric_only = True).iloc[0]
variances = numeric_df.var(numeric_only = True)
ranges = numeric_df.max(numeric_only = True) - numeric_df.min(numeric_only = True)
kurt = numeric_df.kurt(numeric_only = True)
skew = numeric_df.skew(numeric_only = True)
summary_df = pd.DataFrame({
'Count': description.loc['count'],
'Mean': description.loc['mean'],
'Std': description.loc['std'],
'Variance': variances,
'Min': description.loc['min'],
'25%': description.loc['25%'],
'Median': description.loc['50%'],
'75%': description.loc['75%'],
'Max': description.loc['max'],
'Mode': mode,
'Range': ranges,
'Kurtosis': kurt,
'Skew': skew
})
numeric_summary_df = summary_df[['Count', 'Mean', 'Std', 'Variance', 'Min', '25%', 'Median', '75%', 'Max', 'Mode', 'Range', 'Kurtosis', 'Skew']]
numeric_summary_df.reset_index(inplace = True)
numeric_summary_df.rename(columns={'index': 'Variable'}, inplace = True)
# Display the final DataFrame
numeric_summary_df
| Variable | Count | Mean | Std | Variance | Min | 25% | Median | 75% | Max | Mode | Range | Kurtosis | Skew | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | time_in_hospital | 101766.0 | 4.395987 | 2.985108 | 8.910868 | 1.0 | 2.0 | 4.0 | 6.0 | 14.0 | 3 | 13 | 0.850251 | 1.133999 |
| 1 | num_lab_procedures | 101766.0 | 43.095641 | 19.674362 | 387.080530 | 1.0 | 31.0 | 44.0 | 57.0 | 132.0 | 1 | 131 | -0.245074 | -0.236544 |
| 2 | num_procedures | 101766.0 | 1.339730 | 1.705807 | 2.909777 | 0.0 | 0.0 | 1.0 | 2.0 | 6.0 | 0 | 6 | 0.857110 | 1.316415 |
| 3 | num_medications | 101766.0 | 16.021844 | 8.127566 | 66.057332 | 1.0 | 10.0 | 15.0 | 20.0 | 81.0 | 13 | 80 | 3.468155 | 1.326672 |
| 4 | number_outpatient | 101766.0 | 0.369357 | 1.267265 | 1.605961 | 0.0 | 0.0 | 0.0 | 0.0 | 42.0 | 0 | 42 | 147.907736 | 8.832959 |
| 5 | number_emergency | 101766.0 | 0.197836 | 0.930472 | 0.865779 | 0.0 | 0.0 | 0.0 | 0.0 | 76.0 | 0 | 76 | 1191.686726 | 22.855582 |
| 6 | number_inpatient | 101766.0 | 0.635566 | 1.262863 | 1.594824 | 0.0 | 0.0 | 0.0 | 1.0 | 21.0 | 0 | 21 | 20.719397 | 3.614139 |
| 7 | number_diagnoses | 101766.0 | 7.422607 | 1.933600 | 3.738810 | 1.0 | 6.0 | 8.0 | 9.0 | 16.0 | 9 | 15 | -0.079056 | -0.876746 |
def create_numeric_variable_grid(n_variables, n_cols=3, figsize=(16, 4)):
"""
Creates a dynamic grid layout for subplots.
Parameters:
- n_variables: Total number of variables to visualize.
- n_cols: Number of columns in the subplot grid.
- figsize: Tuple representing the figure size per row.
Returns:
- fig, axes: Matplotlib figure and flattened axes array.
"""
# Calculate the number of rows needed
n_rows = (n_variables + n_cols - 1) // n_cols # Ceiling division
# Create subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(figsize[0], figsize[1] * n_rows))
axes = axes.flatten() # Flatten axes for easier iteration
# Turn off unused subplots
for ax in axes[n_variables:]:
ax.axis("off") # Hide unused axes
return fig, axes
def ecdf(data, axes):
"""
ECDF plot with percentile markers, confidence intervals, and summary metrics.
"""
# Sort data and compute ECDF
x = np.sort(data)
y = np.arange(1, len(x) + 1) / len(x)
# Plot ECDF
axes.step(x, y, where="post", color="blue", label="ECDF")
# Highlight percentiles
percentiles = [0.25, 0.5, 0.75]
percentile_values = np.percentile(data, [25, 50, 75])
for p, value in zip(percentiles, percentile_values):
axes.axhline(p, linestyle="--", color="gray", alpha=0.7)
axes.axvline(value, linestyle="--", color="gray", alpha=0.7)
axes.text(value, p, f"{value:.2f}", fontsize=9, verticalalignment="bottom")
# Add confidence intervals
n = len(data)
ci_band = 1.36 / np.sqrt(n) # Approximation for 95% CI
axes.fill_between(x, y - ci_band, y + ci_band, color="gray", alpha=0.2, label="95% CI")
# Annotate summary statistics
mean, median = np.mean(data), np.median(data)
axes.text(0.05, 0.9, f"Mean: {mean:.2f}\nMedian: {median:.2f}",
transform=axes.transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.5))
# Add labels and title
axes.set_title(f"ECDF: {column}", fontsize=12)
axes.set_xlabel("Value", fontsize=10)
axes.set_ylabel("Cumulative Probability", fontsize=10)
axes.legend()
fig, axes = create_numeric_variable_grid(len(numeric_df.columns))
for i, column in enumerate(numeric_df.columns):
ecdf(numeric_df[column].dropna(), axes[i])
plt.tight_layout()
plt.show()
vis_df = df_clean.select_dtypes(include=['object', 'category'])
vis_df = vis_df.drop(['medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'admission_type_id', 'max_glu_serum', 'A1Cresult'], axis=1)
vis_df.columns
Index(['race', 'gender', 'age', 'discharge_disposition_id',
'admission_source_id', 'payer_code', 'metformin', 'repaglinide',
'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
'insulin', 'glyburide-metformin', 'glipizide-metformin',
'glimepiride-pioglitazone', 'metformin-rosiglitazone',
'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
dtype='object')
response = 'change' # Attribute of interest
other_columns = [col for col in vis_df.columns if col != response] # Exclude target
crosstab_results = {}
for column in other_columns:
crosstab_results[column] = pd.crosstab(vis_df[response], vis_df[column])
def create_heatmaps(crosstab_results, response, normalization='none', n_cols=4, cmap="plasma"):
"""
Creates a grid of heatmaps for given crosstab results.
Parameters:
crosstab_results (dict): Dictionary of crosstabs for each feature.
response (str): Name of the response variable.
normalization (str): Normalization type ('row', 'column', 'overall').
n_cols (int): Number of columns in the subplot grid.
cmap (str): Colormap for the heatmaps.
"""
n_features = len(crosstab_results) # Total features to plot
n_rows = -(-n_features // n_cols) # Compute rows by rounding up
# Create the figure and axes for subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows), sharex=False, sharey=False)
axes = np.ravel(axes) # Flatten axes for consistent 1D handling
# Loop through crosstabs and create heatmaps
for i, (col, crosstab) in enumerate(crosstab_results.items()):
# Normalize the crosstab based on the selected normalization
if normalization == 'none':
normalized_crosstab = crosstab
elif normalization == 'row':
normalized_crosstab = crosstab.div(crosstab.sum(axis=1), axis=0) # Normalize by row
elif normalization == 'column':
normalized_crosstab = crosstab.div(crosstab.sum(axis=0), axis=1) # Normalize by column
elif normalization == 'overall':
normalized_crosstab = crosstab / crosstab.values.sum() # Normalize overall
else:
raise ValueError("Invalid normalization type. Choose 'none', row', 'column', or 'overall'.")
# Plot heatmap
sns.heatmap(normalized_crosstab, annot=True, fmt=".2f" if normalization != 'none' else "d", cmap=cmap, cbar=False, ax=axes[i])
axes[i].set_title(f"Crosstab of {response} vs {col}")
axes[i].set_xlabel(col)
axes[i].set_ylabel(response)
# Remove unused subplots
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
# Adjust layout for better spacing
plt.tight_layout()
plt.show()
# create_heatmaps(crosstab_results, response=response, normalization='none', n_cols=4, cmap="plasma") # Crosstab Heatmap: No Normalization, Raw Counts
# create_heatmaps(crosstab_results, response=response, normalization='row', n_cols=4, cmap="plasma") # Crosstab Heatmap: No Normalization, Proportion of Factor Levels within the Response Variable
# create_heatmaps(crosstab_results, response=response, normalization='column', n_cols=4, cmap="plasma") # Crosstab Heatmap: Column Normalization, Proportion of Factor Levels within the Predictor Variable
create_heatmaps(crosstab_results, response=response, normalization='overall', n_cols=3, cmap="plasma") # Crosstab Heatmap: Overall Normalization, Proportion of Total Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#One Hot Encoding
df_encoded = df_clean.copy()
# Split data before encoding (Prevents Data Leakage)
X_train, X_test, y_train, y_test = train_test_split(df_encoded.drop(columns=['change']),
df_encoded['change'],
test_size=0.2, stratify=df_encoded['change'], random_state=1234)
# Encode `change`
y_train = y_train.map({"No": 0, "Ch": 1})
y_test = y_test.map({"No": 0, "Ch": 1})
# Apply frequency encoding only on high-cardinality categorical variables
high_cardinality_cols = ['medical_specialty', 'payer_code', 'diag_1', 'diag_2', 'diag_3']
for col in high_cardinality_cols:
if col in X_train.columns:
freq_map = X_train[col].value_counts().to_dict() # Train set only
X_train[col] = X_train[col].map(freq_map)
X_test[col] = X_test[col].map(freq_map).fillna(0) # Fill unseen categories with 0
# Apply one-hot encoding to remaining categorical variables
categoric_columns = X_train.select_dtypes(include=['object', 'category']).columns
X_train = pd.get_dummies(X_train, columns=categoric_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categoric_columns, drop_first=True)
# Convert boolean values to 0/1
X_train = X_train.astype(int)
X_test = X_test.astype(int)
# Standardize the dataset
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
# Identify numeric features
binary_features = [col for col in X_train.columns if set(X_train[col].unique()) == {0, 1}]
numeric_features = list(set(X_train.columns) - set(binary_features))
# Apply Scaling to numeric & frequency-encoded features
# scaler = StandardScaler()
# scaler = MinMaxScaler()
scaler = RobustScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
print("Binary Features:", binary_features)
print("Numeric + Frequency-Encoded Features:", numeric_features)
# Final check
print(X_train.head(), X_test.head())
Binary Features: ['race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'race_Unknown', 'gender_Male', 'gender_Unknown/Invalid', 'age_[10-20)', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'age_[90-100)', 'admission_type_id_2', 'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6', 'admission_type_id_7', 'admission_type_id_8', 'discharge_disposition_id_2', 'discharge_disposition_id_3', 'discharge_disposition_id_4', 'discharge_disposition_id_5', 'discharge_disposition_id_6', 'discharge_disposition_id_7', 'discharge_disposition_id_8', 'discharge_disposition_id_9', 'discharge_disposition_id_10', 'discharge_disposition_id_11', 'discharge_disposition_id_12', 'discharge_disposition_id_13', 'discharge_disposition_id_14', 'discharge_disposition_id_15', 'discharge_disposition_id_16', 'discharge_disposition_id_17', 'discharge_disposition_id_18', 'discharge_disposition_id_19', 'discharge_disposition_id_20', 'discharge_disposition_id_22', 'discharge_disposition_id_23', 'discharge_disposition_id_24', 'discharge_disposition_id_25', 'discharge_disposition_id_27', 'discharge_disposition_id_28', 'admission_source_id_2', 'admission_source_id_3', 'admission_source_id_4', 'admission_source_id_5', 'admission_source_id_6', 'admission_source_id_7', 'admission_source_id_8', 'admission_source_id_9', 'admission_source_id_10', 'admission_source_id_11', 'admission_source_id_14', 'admission_source_id_17', 'admission_source_id_20', 'admission_source_id_22', 'admission_source_id_25', 'max_glu_serum_Norm', 'max_glu_serum_>200', 'max_glu_serum_>300', 'A1Cresult_Norm', 'A1Cresult_>7', 'A1Cresult_>8', 'metformin_Down', 'metformin_Steady', 'metformin_Up', 'repaglinide_Down', 'repaglinide_Steady', 'repaglinide_Up', 'nateglinide_Down', 'nateglinide_Steady', 'nateglinide_Up', 'chlorpropamide_Steady', 'chlorpropamide_Up', 'glimepiride_Down', 'glimepiride_Steady', 'glimepiride_Up', 'acetohexamide_Steady', 'glipizide_Down', 'glipizide_Steady', 'glipizide_Up', 'glyburide_Down', 'glyburide_Steady', 'glyburide_Up', 'tolbutamide_Steady', 'pioglitazone_Down', 'pioglitazone_Steady', 'pioglitazone_Up', 'rosiglitazone_Down', 'rosiglitazone_Steady', 'rosiglitazone_Up', 'acarbose_Down', 'acarbose_Steady', 'acarbose_Up', 'miglitol_Down', 'miglitol_Steady', 'miglitol_Up', 'troglitazone_Steady', 'tolazamide_Steady', 'tolazamide_Up', 'insulin_Down', 'insulin_Steady', 'insulin_Up', 'glyburide-metformin_Down', 'glyburide-metformin_Steady', 'glyburide-metformin_Up', 'glipizide-metformin_Steady', 'metformin-rosiglitazone_Steady', 'metformin-pioglitazone_Steady', 'diabetesMed_Yes', 'readmitted_>30', 'readmitted_NO']
Numeric + Frequency-Encoded Features: ['number_outpatient', 'troglitazone_Up', 'metformin-rosiglitazone_Down', 'tolazamide_Down', 'diag_2', 'acetohexamide_Up', 'tolbutamide_Down', 'troglitazone_Down', 'metformin-pioglitazone_Up', 'glipizide-metformin_Up', 'diag_1', 'acetohexamide_Down', 'num_procedures', 'glimepiride-pioglitazone_Down', 'admission_source_id_13', 'metformin-rosiglitazone_Up', 'glimepiride-pioglitazone_Steady', 'num_lab_procedures', 'chlorpropamide_Down', 'payer_code', 'metformin-pioglitazone_Down', 'glipizide-metformin_Down', 'number_diagnoses', 'number_emergency', 'glimepiride-pioglitazone_Up', 'tolbutamide_Up', 'number_inpatient', 'time_in_hospital', 'medical_specialty', 'num_medications', 'diag_3']
time_in_hospital payer_code medical_specialty num_lab_procedures \
67719 -0.50 -0.815483 0.829053 0.000000
90739 -0.50 0.000000 0.829053 0.923077
45794 0.50 0.000000 0.829053 0.115385
73665 0.25 0.000000 -0.332021 -0.153846
16362 0.75 0.230783 -0.170947 -0.269231
num_procedures num_medications number_outpatient number_emergency \
67719 -0.5 0.3 0.0 0.0
90739 1.0 1.2 0.0 0.0
45794 0.0 -0.2 0.0 0.0
73665 -0.5 0.0 0.0 0.0
16362 1.0 -0.2 0.0 0.0
number_inpatient diag_1 ... glimepiride-pioglitazone_Up \
67719 1.0 0.000000 ... 0.0
90739 0.0 0.084995 ... 0.0
45794 0.0 1.454704 ... 0.0
73665 2.0 1.454704 ... 0.0
16362 0.0 -0.403987 ... 0.0
metformin-rosiglitazone_Down metformin-rosiglitazone_Steady \
67719 0.0 0
90739 0.0 0
45794 0.0 0
73665 0.0 0
16362 0.0 0
metformin-rosiglitazone_Up metformin-pioglitazone_Down \
67719 0.0 0.0
90739 0.0 0.0
45794 0.0 0.0
73665 0.0 0.0
16362 0.0 0.0
metformin-pioglitazone_Steady metformin-pioglitazone_Up \
67719 0 0.0
90739 0 0.0
45794 0 0.0
73665 0 0.0
16362 0 0.0
diabetesMed_Yes readmitted_>30 readmitted_NO
67719 1 1 0
90739 1 0 1
45794 1 0 1
73665 0 0 1
16362 0 0 1
[5 rows x 149 columns] time_in_hospital payer_code medical_specialty num_lab_procedures \
35098 0.25 0.000000 0.000000 0.269231
71050 2.00 0.000000 0.829053 -0.423077
23862 -0.25 0.230783 0.829053 -0.576923
206 -0.50 0.230783 0.000000 0.307692
34667 0.00 0.230783 0.000000 0.692308
num_procedures num_medications number_outpatient number_emergency \
35098 -0.5 -0.7 0.0 0.0
71050 0.5 1.5 18.0 0.0
23862 0.0 0.1 0.0 0.0
206 -0.5 -0.3 0.0 0.0
34667 0.0 0.2 0.0 0.0
number_inpatient diag_1 ... glimepiride-pioglitazone_Up \
35098 0.0 -0.289262 ... 0.0
71050 2.0 -0.341028 ... 0.0
23862 0.0 -0.381602 ... 0.0
206 0.0 0.643931 ... 0.0
34667 3.0 -0.171039 ... 0.0
metformin-rosiglitazone_Down metformin-rosiglitazone_Steady \
35098 0.0 0
71050 0.0 0
23862 0.0 0
206 0.0 0
34667 0.0 0
metformin-rosiglitazone_Up metformin-pioglitazone_Down \
35098 0.0 0.0
71050 0.0 0.0
23862 0.0 0.0
206 0.0 0.0
34667 0.0 0.0
metformin-pioglitazone_Steady metformin-pioglitazone_Up \
35098 0 0.0
71050 0 0.0
23862 0 0.0
206 0 0.0
34667 0 0.0
diabetesMed_Yes readmitted_>30 readmitted_NO
35098 0 0 1
71050 1 1 0
23862 1 0 1
206 1 1 0
34667 1 1 0
[5 rows x 149 columns]
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
# Try different values of n_components
n_components_range = range(1, 100, 1) # From 1 to 100, step 1
explained_variance = []
for n in n_components_range:
svd = TruncatedSVD(n_components=n, random_state=1234)
svd.fit(X_train)
explained_variance.append(sum(svd.explained_variance_ratio_))
# Plot variance explained
plt.plot(n_components_range, explained_variance, marker='o')
plt.axhline(y=0.95, color='r', linestyle='--', label="95% variance threshold")
plt.xlabel("Number of Components")
plt.ylabel("Explained Variance Ratio")
plt.title("Choosing Optimal `n_components` for Truncated SVD")
plt.legend()
plt.show()
from sklearn.decomposition import TruncatedSVD
# Define the number of components (From the previous graph, we can see that 37 is the threshold value to achieve 95% variance explained.)
n_components = 37
svd = TruncatedSVD(n_components=n_components, random_state=1234)
# Apply Truncated SVD on training and test data
X_train_reduced = pd.DataFrame(svd.fit_transform(X_train), index=X_train.index)
X_test_reduced = pd.DataFrame(svd.transform(X_test), index=X_test.index)
# Print explained variance ratio
print(f"Explained Variance Ratio (First {n_components} Components): {sum(svd.explained_variance_ratio_):.4f}")
Explained Variance Ratio (First 37 Components): 0.9521
Logistic Regression¶
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Initialize Logistic Regression with balanced class weights
log_reg = LogisticRegression(max_iter=500, solver='lbfgs')
# Use Stratified K-Fold Cross-Validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# Perform cross-validation and calculate accuracy
cv_scores = cross_val_score(log_reg, X_train_reduced, y_train, cv=cv, scoring='accuracy')
# Print results
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
Mean Accuracy: 0.9582 Standard Deviation: 0.0015
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Initialize Logistic Regression with balanced class weights
log_reg = LogisticRegression(max_iter=500, solver='lbfgs')
# Use Stratified K-Fold Cross-Validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
# Perform cross-validation and calculate accuracy
cv_scores = cross_val_score(log_reg, X_train_reduced, y_train, cv=cv, scoring='accuracy')
# Print results
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
Mean Accuracy: 0.9582 Standard Deviation: 0.0015
XGBoost (Extreme Gradient Boosting): Classification model that uses gradient boosting with decision tree to iteratively minimize errors.¶
from xgboost import XGBClassifier # type: ignore
from sklearn.metrics import classification_report, roc_auc_score
# Default XGBoost Model
xgb_default = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=1234)
xgb_default.fit(X_train_reduced, y_train)
# Predictions
y_pred_xgboost = xgb_default.predict(X_test_reduced)
# Evaluate Performance
print("Baseline XGBoost Model:")
print(classification_report(y_test, y_pred_xgboost))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_xgboost))
Baseline XGBoost Model:
precision recall f1-score support
0 0.91 0.97 0.94 10951
1 0.96 0.89 0.93 9403
accuracy 0.94 20354
macro avg 0.94 0.93 0.93 20354
weighted avg 0.94 0.94 0.93 20354
ROC-AUC Score: 0.9322294181520919
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# Define Hyperparameter Grid
param_dist = {
'n_estimators': [100, 300, 500, 700, 1000],
'learning_rate': np.linspace(0.01, 0.3, 10),
'max_depth': [3, 5, 7, 9],
'min_child_weight': [1, 3, 5, 7],
'subsample': [0.5, 0.7, 1.0],
'colsample_bytree': [0.5, 0.7, 1.0],
'gamma': [0, 0.1, 0.2, 0.5, 1],
'lambda': [0, 1, 5, 10],
'alpha': [0, 1, 5, 10]
}
# Initialize XGBoost Model
xgb_model = XGBClassifier(random_state=1234)
# Randomized Search with 5-fold Cross Validation
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist,
n_iter=30, scoring='roc_auc', cv=5,
n_jobs=-1, verbose=2, random_state=1234)
# Fit Model
random_search.fit(X_train_reduced, y_train)
# Best Parameters
print("Best Parameters:", random_search.best_params_)
Fitting 5 folds for each of 30 candidates, totalling 150 fits [CV] END alpha=10, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time= 2.9s [CV] END alpha=1, colsample_bytree=0.7, gamma=0.2, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=100, subsample=1.0; total time= 1.5s [CV] END alpha=10, colsample_bytree=0.7, gamma=0.5, lambda=5, learning_rate=0.042222222222222223, max_depth=9, min_child_weight=3, n_estimators=500, subsample=0.5; total time= 5.6s [CV] END alpha=0, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.3, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=1.0; total time= 5.7s [CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=1, learning_rate=0.23555555555555557, max_depth=3, min_child_weight=5, n_estimators=1000, subsample=0.5; total time= 5.2s
/opt/anaconda3/envs/ML7331/lib/python3.12/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak. warnings.warn(
[CV] END alpha=1, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time= 2.1s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time= 2.1s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.5, lambda=5, learning_rate=0.042222222222222223, max_depth=9, min_child_weight=3, n_estimators=500, subsample=0.5; total time= 5.6s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.20333333333333334, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=0.5; total time= 7.0s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.1, lambda=0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=3, n_estimators=300, subsample=0.7; total time= 3.1s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=3, n_estimators=1000, subsample=0.7; total time= 5.0s
[CV] END alpha=1, colsample_bytree=0.7, gamma=1, lambda=10, learning_rate=0.10666666666666666, max_depth=9, min_child_weight=5, n_estimators=500, subsample=0.7; total time= 5.5s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time= 2.8s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time= 2.0s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.5; total time= 5.5s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.3, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=1.0; total time= 5.6s
[CV] END alpha=0, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 2.1s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.1388888888888889, max_depth=3, min_child_weight=1, n_estimators=1000, subsample=0.5; total time= 4.9s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.2, lambda=5, learning_rate=0.1388888888888889, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.7; total time= 7.8s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time= 2.0s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=5, n_estimators=500, subsample=1.0; total time= 2.1s
[CV] END alpha=1, colsample_bytree=0.7, gamma=0.2, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=100, subsample=1.0; total time= 1.5s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.20333333333333334, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=0.5; total time= 7.0s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=1, learning_rate=0.23555555555555557, max_depth=3, min_child_weight=5, n_estimators=1000, subsample=0.5; total time= 5.0s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.1388888888888889, max_depth=3, min_child_weight=1, n_estimators=1000, subsample=0.5; total time= 5.0s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.5, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=7, n_estimators=300, subsample=0.5; total time= 1.4s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=1, n_estimators=300, subsample=1.0; total time= 1.5s
[CV] END alpha=1, colsample_bytree=0.7, gamma=1, lambda=10, learning_rate=0.10666666666666666, max_depth=9, min_child_weight=5, n_estimators=500, subsample=0.7; total time= 5.4s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.5; total time= 0.7s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.5; total time= 7.8s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time= 2.9s
[CV] END alpha=1, colsample_bytree=0.7, gamma=0.2, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=100, subsample=1.0; total time= 1.5s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.5, lambda=5, learning_rate=0.042222222222222223, max_depth=9, min_child_weight=3, n_estimators=500, subsample=0.5; total time= 5.7s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.3, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=1.0; total time= 5.8s
[CV] END alpha=0, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 2.2s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.1388888888888889, max_depth=3, min_child_weight=1, n_estimators=1000, subsample=0.5; total time= 5.0s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.2, lambda=5, learning_rate=0.1388888888888889, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.7; total time= 7.9s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.5; total time= 0.7s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.5; total time= 7.9s
[CV] END alpha=1, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=1, n_estimators=300, subsample=0.5; total time= 1.7s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.5, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=7, n_estimators=300, subsample=0.5; total time= 1.4s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.2, lambda=5, learning_rate=0.1388888888888889, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.7; total time= 7.7s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.5; total time= 0.7s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.5; total time= 7.8s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0.2, lambda=1, learning_rate=0.2677777777777778, max_depth=7, min_child_weight=7, n_estimators=500, subsample=0.5; total time= 4.6s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time= 2.8s
[CV] END alpha=1, colsample_bytree=0.7, gamma=0.2, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=100, subsample=1.0; total time= 1.5s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.5, lambda=5, learning_rate=0.042222222222222223, max_depth=9, min_child_weight=3, n_estimators=500, subsample=0.5; total time= 5.6s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.20333333333333334, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=0.5; total time= 7.0s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.1, lambda=0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=3, n_estimators=300, subsample=0.7; total time= 3.0s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=3, n_estimators=1000, subsample=0.7; total time= 5.0s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=1, n_estimators=300, subsample=1.0; total time= 1.4s
[CV] END alpha=10, colsample_bytree=1.0, gamma=1, lambda=10, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=700, subsample=1.0; total time= 4.7s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.5; total time= 0.7s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.5; total time= 7.8s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0.2, lambda=1, learning_rate=0.2677777777777778, max_depth=7, min_child_weight=7, n_estimators=500, subsample=0.5; total time= 4.5s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 1.6s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.1711111111111111, max_depth=9, min_child_weight=1, n_estimators=700, subsample=1.0; total time= 5.6s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.20333333333333334, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=0.5; total time= 6.8s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=1, learning_rate=0.23555555555555557, max_depth=3, min_child_weight=5, n_estimators=1000, subsample=0.5; total time= 4.9s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.1, lambda=0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=3, n_estimators=300, subsample=0.7; total time= 3.0s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.5, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=7, n_estimators=300, subsample=0.5; total time= 1.5s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.5, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=7, n_estimators=300, subsample=0.5; total time= 1.5s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.2, lambda=5, learning_rate=0.1388888888888889, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.7; total time= 7.9s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.7; total time= 5.6s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=0, learning_rate=0.2677777777777778, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.2, lambda=5, learning_rate=0.10666666666666666, max_depth=5, min_child_weight=7, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.2, lambda=5, learning_rate=0.10666666666666666, max_depth=5, min_child_weight=7, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0.2, lambda=1, learning_rate=0.2677777777777778, max_depth=7, min_child_weight=7, n_estimators=500, subsample=0.5; total time= 4.4s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=1, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=100, subsample=0.5; total time= 1.6s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.1711111111111111, max_depth=9, min_child_weight=1, n_estimators=700, subsample=1.0; total time= 5.5s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.5; total time= 5.5s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=1, learning_rate=0.23555555555555557, max_depth=3, min_child_weight=5, n_estimators=1000, subsample=0.5; total time= 5.0s
[CV] END alpha=0, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 2.1s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.1388888888888889, max_depth=3, min_child_weight=1, n_estimators=1000, subsample=0.5; total time= 4.9s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.2, lambda=5, learning_rate=0.1388888888888889, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.7; total time= 7.9s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.5; total time= 0.6s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.7; total time= 5.8s
[CV] END alpha=10, colsample_bytree=0.7, gamma=1, lambda=1, learning_rate=0.1711111111111111, max_depth=3, min_child_weight=3, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=1, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=1, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=1, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=7, min_child_weight=7, n_estimators=300, subsample=0.7; total time= 3.1s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 1.6s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=1, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=100, subsample=0.5; total time= 1.4s
[CV] END alpha=10, colsample_bytree=0.5, gamma=0.2, lambda=0, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=500, subsample=1.0; total time= 2.8s
[CV] END alpha=1, colsample_bytree=0.7, gamma=0.2, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=100, subsample=1.0; total time= 1.5s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.5, lambda=5, learning_rate=0.042222222222222223, max_depth=9, min_child_weight=3, n_estimators=500, subsample=0.5; total time= 5.6s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.20333333333333334, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=0.5; total time= 7.1s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.1, lambda=0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=3, n_estimators=300, subsample=0.7; total time= 3.0s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=3, n_estimators=1000, subsample=0.7; total time= 5.1s
[CV] END alpha=1, colsample_bytree=0.7, gamma=1, lambda=10, learning_rate=0.10666666666666666, max_depth=9, min_child_weight=5, n_estimators=500, subsample=0.7; total time= 5.5s
[CV] END alpha=10, colsample_bytree=1.0, gamma=1, lambda=10, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=700, subsample=1.0; total time= 4.8s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.2677777777777778, max_depth=9, min_child_weight=5, n_estimators=700, subsample=0.5; total time= 7.6s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0, lambda=0, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.5; total time= 3.7s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.1711111111111111, max_depth=9, min_child_weight=1, n_estimators=700, subsample=1.0; total time= 5.5s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.5; total time= 5.5s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.3, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=1.0; total time= 5.6s
[CV] END alpha=0, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 2.0s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=3, n_estimators=1000, subsample=0.7; total time= 5.1s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=1, n_estimators=300, subsample=1.0; total time= 1.5s
[CV] END alpha=1, colsample_bytree=0.7, gamma=1, lambda=10, learning_rate=0.10666666666666666, max_depth=9, min_child_weight=5, n_estimators=500, subsample=0.7; total time= 5.6s
[CV] END alpha=10, colsample_bytree=1.0, gamma=1, lambda=10, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=700, subsample=1.0; total time= 4.9s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=0, learning_rate=0.2677777777777778, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.7; total time= 1.0s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=0, learning_rate=0.2677777777777778, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=10, colsample_bytree=0.7, gamma=1, lambda=1, learning_rate=0.1711111111111111, max_depth=3, min_child_weight=3, n_estimators=300, subsample=0.5; total time= 1.5s
[CV] END alpha=1, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=1, n_estimators=300, subsample=0.5; total time= 1.7s
[CV] END alpha=1, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=7, min_child_weight=7, n_estimators=300, subsample=0.7; total time= 3.0s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 1.6s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=1, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=100, subsample=0.5; total time= 1.3s
Best Parameters: {'subsample': 0.7, 'n_estimators': 700, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.1388888888888889, 'lambda': 5, 'gamma': 0.2, 'colsample_bytree': 0.7, 'alpha': 0}
best_params_random = random_search.best_params_ # Use best from Random/Grid/Bayesian search
xgb_best_random = XGBClassifier(**best_params_random, random_state=1234)
xgb_best_random.fit(X_train_reduced, y_train)
y_pred_best_random = xgb_best_random.predict(X_test_reduced)
# Evaluation Metrics
print("XGBoost Model:")
print(classification_report(y_test, y_pred_best_random))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_best_random))
XGBoost Model:
precision recall f1-score support
0 0.94 0.99 0.96 10951
1 0.98 0.93 0.96 9403
accuracy 0.96 20354
macro avg 0.96 0.96 0.96 20354
weighted avg 0.96 0.96 0.96 20354
ROC-AUC Score: 0.9591421681333901
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np
import pandas as pd
def evaluate_classification_model(model, model_name, X_train, y_train, cv_folds=10, random_state=1234):
print(f"Running Cross-Validation for {model_name}...")
# Define Stratified K-Fold CV
cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
# Perform Cross-Validation
accuracy_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="accuracy")
auc_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="roc_auc")
f1_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="f1")
# Store results
results = {
"Model": model_name,
"Mean Accuracy": np.mean(accuracy_scores),
"Mean ROC-AUC": np.mean(auc_scores),
"Mean F1-Score": np.mean(f1_scores),
"Std Dev ROC-AUC": np.std(auc_scores),
"Std Dev F1-Score": np.std(f1_scores),
}
# Convert to DataFrame for readability
results_df = pd.DataFrame([results])
return results_df
# Evaluate Logistic Regression separately
log_reg_results = evaluate_classification_model(log_reg, "Logistic Regression", X_train_reduced, y_train)
print(log_reg_results)
Running Cross-Validation for Logistic Regression...
Model Mean Accuracy Mean ROC-AUC Mean F1-Score \
0 Logistic Regression 0.958151 0.974565 0.952913
Std Dev ROC-AUC Std Dev F1-Score
0 0.00088 0.001719
# Evaluate Best XGBoost (Randomized Search)
xgb_random_results = evaluate_classification_model(xgb_best_random, "XGBoost (Randomized Search)", X_train_reduced, y_train)
print(xgb_random_results)
Running Cross-Validation for XGBoost (Randomized Search)...
Model Mean Accuracy Mean ROC-AUC Mean F1-Score \
0 XGBoost (Randomized Search) 0.95895 0.987159 0.954298
Std Dev ROC-AUC Std Dev F1-Score
0 0.000576 0.001426
[CV] END alpha=10, colsample_bytree=1.0, gamma=1, lambda=10, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=700, subsample=1.0; total time= 4.6s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=0, learning_rate=0.2677777777777778, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.7; total time= 1.0s
[CV] END alpha=10, colsample_bytree=0.7, gamma=1, lambda=1, learning_rate=0.1711111111111111, max_depth=3, min_child_weight=3, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.2, lambda=5, learning_rate=0.10666666666666666, max_depth=5, min_child_weight=7, n_estimators=100, subsample=0.7; total time= 0.8s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0.2, lambda=1, learning_rate=0.2677777777777778, max_depth=7, min_child_weight=7, n_estimators=500, subsample=0.5; total time= 4.5s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 1.6s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0, lambda=0, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.5; total time= 3.9s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=1, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=100, subsample=0.5; total time= 1.4s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.1711111111111111, max_depth=9, min_child_weight=1, n_estimators=700, subsample=1.0; total time= 5.5s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.5; total time= 5.5s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=1, learning_rate=0.23555555555555557, max_depth=3, min_child_weight=5, n_estimators=1000, subsample=0.5; total time= 4.9s
[CV] END alpha=0, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=5, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 2.1s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0.5, lambda=10, learning_rate=0.1388888888888889, max_depth=3, min_child_weight=1, n_estimators=1000, subsample=0.5; total time= 4.9s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0.5, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=7, n_estimators=300, subsample=0.5; total time= 1.5s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=1, n_estimators=300, subsample=1.0; total time= 1.5s
[CV] END alpha=1, colsample_bytree=0.7, gamma=1, lambda=10, learning_rate=0.10666666666666666, max_depth=9, min_child_weight=5, n_estimators=500, subsample=0.7; total time= 5.5s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.7; total time= 5.7s
[CV] END alpha=10, colsample_bytree=0.7, gamma=1, lambda=1, learning_rate=0.1711111111111111, max_depth=3, min_child_weight=3, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.2, lambda=5, learning_rate=0.10666666666666666, max_depth=5, min_child_weight=7, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0.2, lambda=1, learning_rate=0.2677777777777778, max_depth=7, min_child_weight=7, n_estimators=500, subsample=0.5; total time= 4.4s
[CV] END alpha=0, colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=300, subsample=0.7; total time= 1.6s
[CV] END alpha=1, colsample_bytree=0.5, gamma=0, lambda=1, learning_rate=0.3, max_depth=9, min_child_weight=3, n_estimators=100, subsample=0.5; total time= 1.3s
[CV] END alpha=1, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=7, min_child_weight=7, n_estimators=300, subsample=0.7; total time= 3.0s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0, lambda=0, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.5; total time= 3.6s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.7; total time= 5.5s
[CV] END alpha=10, colsample_bytree=0.7, gamma=1, lambda=1, learning_rate=0.1711111111111111, max_depth=3, min_child_weight=3, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=1, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=1, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=1, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=7, min_child_weight=7, n_estimators=300, subsample=0.7; total time= 3.1s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0, lambda=0, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.5; total time= 3.6s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.1711111111111111, max_depth=9, min_child_weight=1, n_estimators=700, subsample=1.0; total time= 5.5s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.5; total time= 5.3s
[CV] END alpha=0, colsample_bytree=0.5, gamma=0, lambda=10, learning_rate=0.3, max_depth=5, min_child_weight=3, n_estimators=1000, subsample=1.0; total time= 5.8s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0.1, lambda=0, learning_rate=0.20333333333333334, max_depth=7, min_child_weight=3, n_estimators=300, subsample=0.7; total time= 3.0s
[CV] END alpha=10, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=3, n_estimators=1000, subsample=0.7; total time= 5.2s
[CV] END alpha=10, colsample_bytree=0.7, gamma=0, lambda=5, learning_rate=0.2677777777777778, max_depth=3, min_child_weight=1, n_estimators=300, subsample=1.0; total time= 1.5s
[CV] END alpha=10, colsample_bytree=1.0, gamma=1, lambda=10, learning_rate=0.01, max_depth=5, min_child_weight=3, n_estimators=700, subsample=1.0; total time= 4.9s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0.5, lambda=0, learning_rate=0.1711111111111111, max_depth=7, min_child_weight=1, n_estimators=700, subsample=0.7; total time= 5.7s
[CV] END alpha=5, colsample_bytree=1.0, gamma=0.5, lambda=0, learning_rate=0.2677777777777778, max_depth=5, min_child_weight=3, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=5, colsample_bytree=0.5, gamma=0.2, lambda=5, learning_rate=0.10666666666666666, max_depth=5, min_child_weight=7, n_estimators=100, subsample=0.7; total time= 0.9s
[CV] END alpha=1, colsample_bytree=1.0, gamma=0, lambda=1, learning_rate=0.07444444444444444, max_depth=3, min_child_weight=1, n_estimators=300, subsample=0.5; total time= 1.6s
[CV] END alpha=1, colsample_bytree=1.0, gamma=1, lambda=5, learning_rate=0.01, max_depth=7, min_child_weight=7, n_estimators=300, subsample=0.7; total time= 3.2s
[CV] END alpha=5, colsample_bytree=0.7, gamma=0, lambda=0, learning_rate=0.3, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.5; total time= 3.6s
The End¶